1 Added simple tests
[clrlibru.git] / clrlibru.c
index 825cb65..76b44f9 100644 (file)
@@ -32,6 +32,8 @@
 #include <string.h>
 #include <ctype.h>
 
+static void pushoutstr(FILE *out, char *buf, int len);
+
 static void
 usage() {
         printf("Clear text from lib.ru for Pocket PC, Version 0.2.\n");
@@ -41,12 +43,15 @@ usage() {
         exit(0);
 }
 
-char* RemoveTag[]={
+/*
+ * NULL-terminated list of lowercase tag names.
+ * NOTE(review): presumably the set is_rtag() matches against, so that
+ * main() discards these elements instead of keeping their text -- confirm
+ * against the body of is_rtag(), which is not visible in this hunk.
+ */
+static char* RemoveTag[]={
        "select",
        "head",
        "div",
        "a",
        "form",
+       "script",
+       "style",
+       "title",
        NULL
 };
 
@@ -63,6 +68,43 @@ is_rtag(char *tag, int len) {
        return 0;
 }
 
+/*
+ * One replacement rule for a named character entity.
+ *   quote - entity name as compared by pushoutquot(), i.e. the text
+ *           between '&' and ';'
+ *   str   - text written to the output in place of the entity
+ *   len   - length of str; negative until pushoutquot() computes it
+ *           with strlen() on first use
+ */
+typedef struct {
+       char    *quote;
+       char    *str;
+       int     len;
+} Quote;
+
+/*
+ * Named character entities recognised by pushoutquot() and the plain text
+ * written in their place.  Names are stored without the leading '&' and
+ * the trailing ';'.  "shy" maps to the empty string, so it produces no
+ * output at all.
+ *
+ * The lookup returns on the first matching name, so every name must be
+ * listed only once.  len starts at -1 and is replaced by strlen(str) the
+ * first time the entry is used.  The entry whose quote is NULL ends the
+ * table.
+ */
+static Quote quoteChange[] = {
+       {"quot", "\"", -1},
+       {"laquo", "\"", -1},
+       {"raquo", "\"", -1},
+       {"lt", "<", -1},
+       {"gt", ">", -1},
+       {"nbsp", " ", -1},
+       {"mdash", "-", -1},
+       {"amp", "&", -1},
+       {"shy", "", -1},
+       {NULL, NULL, -1}
+};
+
+/*
+ * Look up the entity name stored in buf[0..buflen) in quoteChange and,
+ * when it is known, write its replacement text through pushoutstr().
+ *
+ * buf[buflen] is overwritten with '\0' so the name can be compared with
+ * strcmp(); the caller's buffer must have room for that extra byte.
+ * The replacement length is computed on first use and cached in the
+ * table entry.
+ *
+ * Returns 1 when the entity was recognised and written, 0 otherwise.
+ */
+static int
+pushoutquot(FILE *out, char *buf, int buflen) {
+       Quote   *entry;
+
+       buf[buflen] = '\0';
+       for ( entry = quoteChange; entry->quote != NULL; entry++ ) {
+               if ( strcmp( entry->quote, buf ) != 0 )
+                       continue;
+
+               /* first hit for this entry: remember the replacement length */
+               if ( entry->len < 0 )
+                       entry->len = strlen( entry->str );
+               pushoutstr( out, entry->str, entry->len );
+               return 1;
+       }
+       return 0;
+}
+
 
 static char *optarg = NULL;
 static int current=1;
@@ -98,21 +140,85 @@ mgetopt(int argn, char* argv[], char *option) {
        return (int)key;
 }
 
-#define INTXT  0
-#define INTAG  1
-#define FINDEND        2
+/*
+ * Values of the `state` variable that drives the character loop in main().
+ *   INTXT         - ordinary text outside of any tag
+ *   INHEADTAG     - just after '<', collecting the lowercased tag name
+ *   INTAG         - inside a tag that is not dropped, skipping to '>'
+ *   FINDEND       - after a newline, counting leading spaces and tabs
+ *   WAITAFTERRED  - skipping whitespace after a paragraph was started
+ *   COMMENTBEGIN1, COMMENTBEGIN2, COMMENTIN, COMMENTEND1, COMMENTEND2
+ *                 - recognising and skipping a "<!--" ... "-->" comment
+ *   INQUOTE       - after '&', collecting the name of a character entity
+ *   INDROPTAG, INDROPINTAG, INDROPINCLSTAG
+ *                 - entered for tags for which is_rtag() is true;
+ *                   NOTE(review): their handling is only partly visible
+ *                   in this diff, presumably they skip the element up to
+ *                   its closing tag -- confirm in main()
+ */
+#define INTXT          0
+#define INTAG          1
+#define FINDEND                2
 #define INDROPTAG      3
 #define INDROPINTAG    4
 #define INHEADTAG      5
 #define INDROPINCLSTAG 6
+#define WAITAFTERRED   7
+#define COMMENTBEGIN1  8       
+#define COMMENTBEGIN2  9       
+#define COMMENTIN      10      
+#define COMMENTEND1    11      
+#define COMMENTEND2    12      
+#define        INQUOTE         13
 
+/* Size of the scratch buffer in main(); -l values must stay below it. */
+#define        BUFFERLENGTH    8192
+/* Indentation that pushout() writes when it starts a Paragraph. */
+#define REDSTRING      "  "
+
+/*
+ * Kind of output item handed to pushout().
+ *   Char      - a single character, passed in the value argument
+ *   NewLine   - a line break
+ *   Tag       - a tag was removed from the text; pushout() may write one
+ *               separating space for it
+ *   Paragraph - start of a paragraph: a line break followed by REDSTRING
+ *   None      - initial "nothing written yet" value of pushout()'s
+ *               previous-type state
+ */
+typedef enum TypeOut {
+       Char,
+       NewLine,
+       Tag,
+       Paragraph,
+       None
+} TypeOut;
+
+/*
+ * Write one output item to `out`, normalising whitespace on the way.
+ *
+ * The function remembers, in static variables, the type of the previous
+ * item, the value of the last Char item and a counter of line breaks:
+ *   Char      - written as is, except that a space or tab directly after
+ *               a Tag, a NewLine or at the very beginning is swallowed;
+ *   NewLine   - written only while the line-break counter is below two,
+ *               which limits runs of line breaks;
+ *   Tag       - ignored when the previous item was also a Tag; otherwise
+ *               one space is written unless the output already ends in
+ *               whitespace or nothing has been written yet;
+ *   Paragraph - ignored when the previous item was also a Paragraph;
+ *               otherwise a NewLine is pushed and REDSTRING is written.
+ * Any other type that differs from the previous one prints a diagnostic
+ * and terminates the program with exit status 1.
+ *
+ * `value` is only meaningful for Char items.
+ */
+static void
+pushout( FILE *out, TypeOut type, int value ) {
+       static TypeOut PrevType=None;
+       static int  prevvalue=0;
+       static int newlinecount=0;
+
+       switch( type ) {
+               case Char: {
+                       int blank = ( value == ' ' || value == '\t' );
+                       int atstart = ( PrevType==Tag || PrevType==None || PrevType==NewLine );
+
+                       /* drop leading blanks, pass everything else through */
+                       if ( !( atstart && blank ) ) {
+                               newlinecount=0;
+                               fputc(value, out);
+                       }
+                       prevvalue = value;
+                       break;
+               }
+               case NewLine:
+                       if ( newlinecount < 2 )
+                               fputc('\n', out);
+                       newlinecount++;
+                       break;
+               case Tag:
+                       if ( PrevType != Tag ) {
+                               int afterblank = ( PrevType==Char &&
+                                       ( prevvalue == ' ' || prevvalue == '\t' || prevvalue == '\n' ) );
+
+                               /* separate the words on both sides of the removed tag */
+                               if ( PrevType!=None && PrevType!=NewLine && !afterblank ) {
+                                       newlinecount=0;
+                                       fputc(' ', out);
+                               }
+                       }
+                       break;
+               case Paragraph:
+                       if ( PrevType != Paragraph ) {
+                               pushout(out, NewLine, 0);
+                               fwrite(REDSTRING, sizeof(char), strlen(REDSTRING), out);
+                               newlinecount=0;
+                       }
+                       break;
+               default:
+                       if ( type != PrevType ) {
+                               printf("Unknown type: %d", type);
+                               exit(1);
+                       }
+                       break;
+       }
+       PrevType = type;
+}
+
+/*
+ * Feed the first `len` bytes of `buf` to pushout() one by one as Char
+ * items, so that they go through the same whitespace handling as text
+ * read from the input.  Nothing is written when len is zero or negative.
+ */
+static void
+pushoutstr(FILE *out, char *buf, int len) {
+       int i;
+
+       for ( i = 0; i < len; i++ )
+               pushout(out, Char, (int)buf[i]);
+}
 
 int 
 main(int argn, char *argv[]) {
        int ch;
        FILE    *in=stdin, *out=stdout;
        int state=INTXT;
-       char    buf[8192];
+       char    buf[BUFFERLENGTH];
        int lenbuf=0,closelen=0;
         int spacelen=4;
 
@@ -132,8 +238,8 @@ main(int argn, char *argv[]) {
                                break;
                        case 'l':
                                spacelen = atoi(optarg);
-                               if ( spacelen < 0 ) {
-                                       printf("-l should be >= 0\n");
+                               if ( spacelen < 0 || spacelen >= BUFFERLENGTH ) {
+                                       printf("-l should be >= 0 and < %d\n", BUFFERLENGTH);
                                        exit(1);
                                }
                                break;
@@ -145,6 +251,9 @@ main(int argn, char *argv[]) {
        }
        
        while( (ch=getc(in)) != EOF ) {
+               if ( ch == '\r' )
+                       continue;
+
                if ( state==INTXT ) {
                        if ( ch == '<' ) {
                                state=INHEADTAG;
@@ -152,23 +261,35 @@ main(int argn, char *argv[]) {
                        } else if ( ch == '\n' ) {
                                state=FINDEND;
                                lenbuf=1;
-                               *buf = ch;
-                       } else if ( ch != '\r' )
-                               fputc(ch,out);
+                       } else if ( ch=='&' ) {
+                               *buf='&';
+                               lenbuf=1;       
+                               state=INQUOTE;
+                       } else { 
+                               pushout(out, Char, ch);
+                       }
                } else if ( state==INHEADTAG ) {
                        if ( isalpha(ch) ) {
-                               buf[ lenbuf ] = tolower(ch);
-                               lenbuf++;
+                               if ( lenbuf < BUFFERLENGTH-1 ) { 
+                                       buf[ lenbuf ] = tolower(ch);
+                                       lenbuf++;
+                               }
+                       } else if ( ch == '!' ) {
+                               state = COMMENTBEGIN1;
                        } else if ( ch == '>' ) {
                                if ( is_rtag(buf,lenbuf) ) {
                                        state = INDROPTAG;
                                        closelen=0;
                                } else {
                                        state=INTXT;
-                                       fputc(' ',out);
+                                       if ( lenbuf==0 )
+                                               pushoutstr(out, "<>", 2);
+                                       else
+                                               pushout(out, Tag, 0);
                                }
                        } else if ( lenbuf == 0 && ch != '/' ) {
-                               fputc('<',out); fputc(ch,out); 
+                               pushout(out, Char, '<');
+                               pushout(out, Char, ch);
                                state=INTXT;
                        } else { 
                                if ( is_rtag(buf,lenbuf) ) {
@@ -176,14 +297,13 @@ main(int argn, char *argv[]) {
                                        closelen=0;
                                } else {
                                        state=INTAG;
-                                       fputc(' ',out);
                                }
                        }
                } else if ( state==INTAG ) {
                        if ( ch == '>' ) {
                                state=INTXT;
-                               fputc(' ',out);
-                       }       
+                               pushout(out, Tag, 0);
+                       }
                } else if ( state == INDROPTAG ) {
                        if ( ch == '<' ) {
                                state=INDROPINTAG;
@@ -205,28 +325,68 @@ main(int argn, char *argv[]) {
                        } else
                                state=INDROPTAG;
                } else if ( state==FINDEND ) {
-                       if ( ch == ' ' ) {
-                               buf[ lenbuf ] = ch;
+                       if ( ch == ' ' || ch == '\t' ) {
                                lenbuf++;
                                if ( lenbuf > spacelen ) {
-                                       fwrite(buf, sizeof(char), lenbuf, out);
-                                       state=INTXT;
+                                       pushout( out, Paragraph, 0 );
+                                       state=WAITAFTERRED;
                                }
                        } else if ( ch=='\n' ) {
-                               buf[ lenbuf ] = ch;
+                               pushout( out, NewLine, 0 );
+                               pushout( out, NewLine, 0 );
                                lenbuf++;
-                               fwrite(buf, sizeof(char), lenbuf, out);
+                       } else {
                                state=INTXT;
-                       } else if ( ch !='\r' ) {
+                               pushout(out, Char, ' ');
+                               ungetc(ch,in);
+                       }
+               } else if ( state==WAITAFTERRED ) {
+                       if ( !isspace(ch) ) {
+                               ungetc(ch,in);
                                state=INTXT;
-                               fputc(' ',out);
+                       }
+               } else if ( state==COMMENTBEGIN1 ) {
+                       if ( ch == '-' ) {
+                               state = COMMENTBEGIN2;
+                       } else {
+                               pushoutstr(out, "<!", 2);
+                               ungetc(ch,in);
+                               state=INTXT;
+                       }
+               } else if ( state==COMMENTBEGIN2 ) {
+                       if ( ch == '-' ) {
+                               state = COMMENTIN;
+                       } else {
+                               pushoutstr(out, "<!-", 2);
+                               ungetc(ch,in);
+                               state=INTXT;
+                       }
+               } else if ( state==COMMENTIN ) {
+                       if ( ch == '-' ) 
+                               state = COMMENTEND1;
+               } else if ( state==COMMENTEND1 ) {
+                       state = ( ch == '-' ) ? COMMENTEND2 : COMMENTIN;
+               } else if ( state==COMMENTEND2 ) {
+                       if ( ch == '>' )
+                               state = INTXT;
+                       else if ( ch != '-' )
+                               state = COMMENTIN;
+               } else if ( state==INQUOTE ) {
+                       if ( isalpha( ch ) && lenbuf < BUFFERLENGTH-2 ) {
+                               buf[ lenbuf ] = ch;
+                               lenbuf++;
+                       } else if ( ch == ';' && lenbuf>1 && pushoutquot( out, buf+1, lenbuf-1 ) ) {
+                               state = INTXT;
+                       } else {
+                               pushoutstr(out, buf, lenbuf);
+                               state = INTXT;
                                ungetc(ch,in);
-                       } 
+                       }
                } else {
                        printf("Unknown state: %d\n", state);
                        exit(1);
                }
-       } 
+       }
                        
        if ( in!=stdin )
                fclose(in);