1 Added simple tests
author teodor <teodor>
Thu, 4 Aug 2005 18:11:00 +0000 (18:11 +0000)
committer teodor <teodor>
Thu, 4 Aug 2005 18:11:00 +0000 (18:11 +0000)
2 Add removal of comments, style, etc.
3 Improve formatting
4 Add transformation of character entities (&quot; etc.)

Makefile
clrlibru.c
expected/test.htm [new file with mode: 0644]
tests/test.htm [new file with mode: 0644]

index bf32e85..04e3721 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -13,6 +13,19 @@ all: clrlibru
 clrlibru: $(OBJS)
        $(CC) $(LIB) -o $@ $(OBJS)
 
+test: all # run each listed tests/ file through clrlibru and diff it against expected/; exits non-zero if any fails
+       @[ -d results ] || mkdir results
+       @[ -d diffs ] || mkdir diffs
+       @status=0 ; for FILE in  test.htm  ; do \
+               printf '%s %s' "$$FILE" "        ........ " ; \
+               if ./clrlibru < tests/$$FILE > results/$$FILE && diff -c expected/$$FILE results/$$FILE > diffs/$$FILE ; then \
+                       echo ok ; \
+               else \
+                       echo FAILED ; status=1 ; \
+               fi ; \
+       done ; exit $$status
+
 clean:
        rm -rf clrlibru *core *.o
+       rm -rf results diffs
 
index 825cb65..76b44f9 100644 (file)
@@ -32,6 +32,8 @@
 #include <string.h>
 #include <ctype.h>
 
+static void pushoutstr(FILE *out, char *buf, int len);
+
 static void
 usage() {
         printf("Clear text from lib.ru for Pocket PC, Version 0.2.\n");
@@ -41,12 +43,15 @@ usage() {
         exit(0);
 }
 
-char* RemoveTag[]={
+static char* RemoveTag[]={
        "select",
        "head",
        "div",
        "a",
        "form",
+       "script",
+       "style",
+       "title",
        NULL
 };
 
@@ -63,6 +68,43 @@ is_rtag(char *tag, int len) {
        return 0;
 }
 
+typedef struct {       /* one character-entity replacement */
+       char    *quote; /* entity name as written between '&' and ';' */
+       char    *str;   /* text emitted in place of the entity */
+       int     len;    /* strlen(str); -1 until pushoutquot() computes it on first use */
+} Quote;
+
+/* Searched in order by pushoutquot(): first match wins, so each name appears once; a NULL name ends the table. */
+static Quote quoteChange[] = {
+       {"quot", "\"", -1},
+       {"laquo", "\"", -1},
+       {"raquo", "\"", -1},
+       {"lt", "<", -1},
+       {"gt", ">", -1},
+       {"nbsp", " ", -1},
+       {"mdash", "-", -1},
+       {"amp", "&", -1},
+       {"shy", "", -1},
+       {NULL, NULL, -1}
+};
+
+static int     /* 1: buf named a known entity and its replacement was emitted; 0: no match, nothing emitted */
+pushoutquot(FILE *out, char *buf, int buflen) {
+       Quote   *entry;
+
+       buf[buflen]='\0';       /* terminated in place: buf needs room for buflen+1 bytes */
+       for( entry = quoteChange; entry->quote != NULL; entry++ ) {
+               if ( strcmp( entry->quote, buf ) != 0 )
+                       continue;
+               /* first match wins; its replacement length is computed once and cached */
+               if ( entry->len < 0 )
+                       entry->len = strlen( entry->str );
+               pushoutstr( out, entry->str, entry->len );
+               return 1;
+       }
+       return 0;
+}
+
 
 static char *optarg = NULL;
 static int current=1;
@@ -98,21 +140,85 @@ mgetopt(int argn, char* argv[], char *option) {
        return (int)key;
 }
 
-#define INTXT  0
-#define INTAG  1
-#define FINDEND        2
+#define INTXT          0
+#define INTAG          1
+#define FINDEND                2
 #define INDROPTAG      3
 #define INDROPINTAG    4
 #define INHEADTAG      5
 #define INDROPINCLSTAG 6
+#define WAITAFTERRED   7
+#define COMMENTBEGIN1  8       
+#define COMMENTBEGIN2  9       
+#define COMMENTIN      10      
+#define COMMENTEND1    11      
+#define COMMENTEND2    12      
+#define        INQUOTE         13
 
+#define        BUFFERLENGTH    8192
+#define REDSTRING      "  "
+
+typedef enum TypeOut { /* kind of event passed to pushout() */
+       Char,           /* one character of text; pushout()'s value argument holds it */
+       NewLine,        /* a line break */
+       Tag,            /* a skipped tag; pushout() writes at most one space for it */
+       Paragraph,      /* paragraph start: pushout() writes a newline and REDSTRING */
+       None            /* initial PrevType inside pushout(), before any event */
+} TypeOut;
+
+static void    /* writes one output event to out, dropping redundant blanks and newlines */
+pushout( FILE *out, TypeOut type, int value ) {        /* value is read only for Char events */
+       static TypeOut PrevType=None;   /* type of the previous call */
+       static int  prevvalue=0;        /* value of the last Char event, written or not */
+       static int newlinecount=0;      /* NewLine events since something else was last written */
+
+       if ( type == Char ) {   /* a space/tab is dropped when the previous event was Tag, None or NewLine */
+               if ( !(( PrevType==Tag || PrevType==None || PrevType==NewLine ) && ( value == ' ' || value == '\t' )) ) { 
+                       newlinecount=0;
+                       fputc(value ,out);
+               }
+               prevvalue = value;      /* recorded even when the blank was dropped */
+       } else if ( type == NewLine ) {
+               if ( newlinecount < 2 )         /* at most two consecutive NewLine events are written */
+                       fputc('\n', out);
+               newlinecount++;
+       } else if ( type != PrevType ) {        /* a repeated Tag or Paragraph event writes nothing */
+               switch(type) {
+                       case Tag:       /* one space, unless the previous event was None, NewLine or a blank/newline Char */
+                               if ( !(PrevType==None || PrevType==NewLine || ( PrevType==Char && ( prevvalue == ' ' || prevvalue == '\t' || prevvalue == '\n'  ) )) ) { 
+                                       newlinecount=0;
+                                       fputc(' ', out);
+                               }
+                               break;
+                       case Paragraph:
+                               pushout(out, NewLine, 0);       /* line break, subject to the limit above */
+                               fwrite(REDSTRING, sizeof(char), strlen(REDSTRING), out);        /* paragraph indent */
+                               newlinecount=0;
+                               break;
+                       default:        /* None (when PrevType is not None) or a value outside the enum */
+                               printf("Unknown type: %d", type);
+                               exit(1);
+               }
+       }
+       PrevType = type;        /* also overwrites the NewLine set by the recursive call in the Paragraph case */
+}
+
+static void    /* sends buf[0..len-1] through pushout() as Char events, one byte per call */
+pushoutstr(FILE *out, char *buf, int len) {
+       int     i;
+
+       for( i = 0; i < len; i++ ) {
+               pushout(out, Char, (int)buf[i]);
+       }
+}
 
 int 
 main(int argn, char *argv[]) {
        int ch;
        FILE    *in=stdin, *out=stdout;
        int state=INTXT;
-       char    buf[8192];
+       char    buf[BUFFERLENGTH];
        int lenbuf=0,closelen=0;
         int spacelen=4;
 
@@ -132,8 +238,8 @@ main(int argn, char *argv[]) {
                                break;
                        case 'l':
                                spacelen = atoi(optarg);
-                               if ( spacelen < 0 ) {
-                                       printf("-l should be >= 0\n");
+                               if ( spacelen < 0 || spacelen >= BUFFERLENGTH ) {
+                                       printf("-l should be >= 0 and < %d\n", BUFFERLENGTH);
                                        exit(1);
                                }
                                break;
@@ -145,6 +251,9 @@ main(int argn, char *argv[]) {
        }
        
        while( (ch=getc(in)) != EOF ) {
+               if ( ch == '\r' )
+                       continue;
+
                if ( state==INTXT ) {
                        if ( ch == '<' ) {
                                state=INHEADTAG;
@@ -152,23 +261,35 @@ main(int argn, char *argv[]) {
                        } else if ( ch == '\n' ) {
                                state=FINDEND;
                                lenbuf=1;
-                               *buf = ch;
-                       } else if ( ch != '\r' )
-                               fputc(ch,out);
+                       } else if ( ch=='&' ) {
+                               *buf='&';
+                               lenbuf=1;       
+                               state=INQUOTE;
+                       } else { 
+                               pushout(out, Char, ch);
+                       }
                } else if ( state==INHEADTAG ) {
                        if ( isalpha(ch) ) {
-                               buf[ lenbuf ] = tolower(ch);
-                               lenbuf++;
+                               if ( lenbuf < BUFFERLENGTH-1 ) { 
+                                       buf[ lenbuf ] = tolower(ch);
+                                       lenbuf++;
+                               }
+                       } else if ( ch == '!' ) {
+                               state = COMMENTBEGIN1;
                        } else if ( ch == '>' ) {
                                if ( is_rtag(buf,lenbuf) ) {
                                        state = INDROPTAG;
                                        closelen=0;
                                } else {
                                        state=INTXT;
-                                       fputc(' ',out);
+                                       if ( lenbuf==0 )
+                                               pushoutstr(out, "<>", 2);
+                                       else
+                                               pushout(out, Tag, 0);
                                }
                        } else if ( lenbuf == 0 && ch != '/' ) {
-                               fputc('<',out); fputc(ch,out); 
+                               pushout(out, Char, '<');
+                               pushout(out, Char, ch);
                                state=INTXT;
                        } else { 
                                if ( is_rtag(buf,lenbuf) ) {
@@ -176,14 +297,13 @@ main(int argn, char *argv[]) {
                                        closelen=0;
                                } else {
                                        state=INTAG;
-                                       fputc(' ',out);
                                }
                        }
                } else if ( state==INTAG ) {
                        if ( ch == '>' ) {
                                state=INTXT;
-                               fputc(' ',out);
-                       }       
+                               pushout(out, Tag, 0);
+                       }
                } else if ( state == INDROPTAG ) {
                        if ( ch == '<' ) {
                                state=INDROPINTAG;
@@ -205,28 +325,68 @@ main(int argn, char *argv[]) {
                        } else
                                state=INDROPTAG;
                } else if ( state==FINDEND ) {
-                       if ( ch == ' ' ) {
-                               buf[ lenbuf ] = ch;
+                       if ( ch == ' ' || ch == '\t' ) {
                                lenbuf++;
                                if ( lenbuf > spacelen ) {
-                                       fwrite(buf, sizeof(char), lenbuf, out);
-                                       state=INTXT;
+                                       pushout( out, Paragraph, 0 );
+                                       state=WAITAFTERRED;
                                }
                        } else if ( ch=='\n' ) {
-                               buf[ lenbuf ] = ch;
+                               pushout( out, NewLine, 0 );
+                               pushout( out, NewLine, 0 );
                                lenbuf++;
-                               fwrite(buf, sizeof(char), lenbuf, out);
+                       } else {
                                state=INTXT;
-                       } else if ( ch !='\r' ) {
+                               pushout(out, Char, ' ');
+                               ungetc(ch,in);
+                       }
+               } else if ( state==WAITAFTERRED ) {
+                       if ( !isspace(ch) ) {
+                               ungetc(ch,in);
                                state=INTXT;
-                               fputc(' ',out);
+                       }
+               } else if ( state==COMMENTBEGIN1 ) {
+                       if ( ch == '-' ) {
+                               state = COMMENTBEGIN2;
+                       } else {
+                               pushoutstr(out, "<!", 2);
+                               ungetc(ch,in);
+                               state=INTXT;
+                       }
+               } else if ( state==COMMENTBEGIN2 ) {    /* "<!-" consumed so far */
+                       if ( ch == '-' ) {
+                               state = COMMENTIN;
+                       } else {        /* not a comment: emit the consumed text and reprocess ch as plain text */
+                               pushoutstr(out, "<!-", 3);      /* all three bytes; length 2 dropped the '-' */
+                               ungetc(ch,in);
+                               state=INTXT;
+                       }
+               } else if ( state==COMMENTIN ) {
+                       if ( ch == '-' ) 
+                               state = COMMENTEND1;
+               } else if ( state==COMMENTEND1 ) {
+                       state = ( ch == '-' ) ? COMMENTEND2 : COMMENTIN;
+               } else if ( state==COMMENTEND2 ) {
+                       if ( ch == '>' )
+                               state = INTXT;
+                       else if ( ch != '-' )
+                               state = COMMENTIN;
+               } else if ( state==INQUOTE ) {
+                       if ( isalpha( ch ) && lenbuf < BUFFERLENGTH-2 ) {
+                               buf[ lenbuf ] = ch;
+                               lenbuf++;
+                       } else if ( ch == ';' && lenbuf>1 && pushoutquot( out, buf+1, lenbuf-1 ) ) {
+                               state = INTXT;
+                       } else {
+                               pushoutstr(out, buf, lenbuf);
+                               state = INTXT;
                                ungetc(ch,in);
-                       } 
+                       }
                } else {
                        printf("Unknown state: %d\n", state);
                        exit(1);
                }
-       } 
+       }
                        
        if ( in!=stdin )
                fclose(in);
diff --git a/expected/test.htm b/expected/test.htm
new file mode 100644 (file)
index 0000000..a74e6a9
--- /dev/null
@@ -0,0 +1,23 @@
+WOW 
+
+NOOO 
+
+YES
+  asdasd sdasdasas asd asd asd asdfaadfaaaa aaaasdfaaasdf a asdf asdfa asdfasdf
+
+  fsdfsd sdfsdffd sdf sdf sdfsdfssdsd sdfsdfs sdf sdf sd fsdfsfsfdee BOLD feseseee se
+  fsdfsd sdfsdffd sdf sdf sdfsdfssdsd sdfsdfs sdf sdf sd fsdfsfsfdee BOLDITALIC feseseee se
+
+  fsdfsd sdfsdffd sdf sdf sd fsdfsfsfdee BOLDITALIC feseseee se
+
+  ìÀÄÉ, ËÏÔÏÒÙÅ  ÎÉËÏÇÄÁ  ÎÅ  ÓÞÉÔÁÌÉ  ÓÅÂÑ  ÇÅÒÏÑÍÉ,  ÎÁÞÁÌÉ  --  ËÁË  × ÏÄÉÎÏÞËÕ,  ÔÁË É ÇÒÕÐÐÁÍÉ  -- ÒÁÚÍÙÛÌÑÔØ,  ËÁË  Ó×ÅÒÇÎÕÔØ  ÔÉÒÁÎÁ. ÷ ×ÏÚÄÕÈÅ ×ÉÔÁÌÉ  ÇÌÕÈÉÅ  ÕÇÒÏÚÙ. èÏÔÑ ÚÁÇÏ×ÏÒÝÉËÉ ÒÅÚËÏ ÒÁÚÌÉÞÁÌÉÓØ ÐÏ  ÒÏÄÕ ÚÁÎÑÔÉÊ, ÏÂÒÁÚÏ×ÁÎÉÀ  É  ÉÎÔÅÌÌÅËÔÕÁÌØÎÙÍ  ×ÏÚÍÏÖÎÏÓÔÑÍ, ÉÈ  ÏÂÝÁÑ  ÍÁÓÓÁ ÐÒÁËÔÉÞÅÓËÉ ÇÁÒÁÎÔÉÒÏ×ÁÌÁ ÉÚÂÁ×ÌÅÎÉÅ  ËÏÒÏÌÅ×ÓÔ×Á ÏÔ ÞÅÌÏ×ÅËÁ, ÖÉÒÅÀÝÅÇÏ  ÎÁ  ÓÔÒÁÄÁÎÉÑÈ ÐÏÄÄÁÎÎÙÈ... ÞÅÌÏ×ÅËÁ, ËÏÔÏÒÏÇÏ ÏÎÉ ÎÁÚÙ×ÁÌÉ ÷ÅÌÉËÉÊ óËÉ×.
+
+çìá÷á ðåò÷áñ 
+
+  åÓÌÉ É ÅÓÔØ ÎÅÞÔÏ,  ÞÅÍÕ  ÎÅÌØÚÑ ÏÂÕÞÉÔØÓÑ, ×ÙÓÔÕÐÁÑ × ËÁÞÅÓÔ×Å ËÒÕÔÏÇÏ ÂÒÁÔËÁ, ÔÁË ÜÔÏ ÉÓËÕÓÓÔ×Ï ÐÒÏ×ÅÄÅÎÉÑ ÄÅÌÏ×ÙÈ ÓÏ×ÅÝÁÎÉÊ.
+  ÷ ÐÒÏÉÚ×ÏÄÓÔ×ÅÎÎÙÈ  ÓÏÂÒÁÎÉÑÈ  (ÍÙ  ÉÈ ÎÁÚÙ×ÁÅÍ  ÒÁÚÂÏÒËÁÍÉ)  óÉÎÄÉËÁÔÁ ÏÂÙÞÎÏ  ÕÞÁÓÔ×ÕÅÔ   ÏÞÅÎØ  ÍÁÌÏ  ÌÀÄÅÊ  (ÞÔÏÂÙ   Ó×ÅÓÔÉ   Ë  ÍÉÎÉÍÕÍÕ  ÞÉÓÌÏ ÐÏÔÅÎÃÉÁÌØÎÙÈ  Ó×ÉÄÅÔÅÌÅÊ),  Á ÐÏ×ÅÓÔËÁ  ÄÎÑ,  ËÁË  ÐÒÁ×ÉÌÏ,  ÏÇÒÁÎÉÞÉ×ÁÅÔÓÑ ÒÁÓÓËÁÚÏÍ Ï ×ÏÚÎÉËÛÅÊ ÐÒÏÂÌÅÍÅ, Ó ÍÉÎÉÍÁÌØÎÏ  ×ÏÚÍÏÖÎÙÍ ËÏÌÉÞÅÓÔ×ÏÍ ÄÅÔÁÌÅÊ. úÁËÁÎÞÉ×ÁÅÔÓÑ ÄÅÌÏ×ÁÑ ×ÓÔÒÅÞÁ ÏÞÅÎØ ÐÒÏÓÔÙÍ ÒÅÛÅÎÉÅÍ: "òÁÚÂÅÒ
+
+  
+
+&qwe; & &amp;   <a href="http://com">
+
diff --git a/tests/test.htm b/tests/test.htm
new file mode 100644 (file)
index 0000000..69cb0d8
--- /dev/null
@@ -0,0 +1,65 @@
+<html>
+<head>
+<title>qq</title>
+</head>
+<body>
+WOW 
+
+<script language="javascript">
+<!--
+var EH;
+//-->
+</script>
+
+NOOO
+<select><option value="QW"></select>
+
+YES
+          asdasd
+sdasdasas asd asd
+asd asdfaadfaaaa aaaasdfaaasdf a
+asdf asdfa asdfasdf
+
+
+
+      fsdfsd sdfsdffd sdf
+sdf sdfsdfssdsd sdfsdfs sdf
+sdf sd fsdfsfsfdee <b>BOLD</b> feseseee se
+      fsdfsd sdfsdffd sdf
+sdf sdfsdfssdsd sdfsdfs sdf
+sdf sd fsdfsfsfdee <b><i>BOLDITALIC</I></b> feseseee se
+
+
+
+
+
+
+
+      fsdfsd sdfsdffd sdf
+sdf sd fsdfsfsfdee <b><i>BOLDITALIC</I></b> feseseee se
+
+
+     ìÀÄÉ, ËÏÔÏÒÙÅ  ÎÉËÏÇÄÁ  ÎÅ  ÓÞÉÔÁÌÉ  ÓÅÂÑ  ÇÅÒÏÑÍÉ,  ÎÁÞÁÌÉ  --  ËÁË  ×
+ÏÄÉÎÏÞËÕ,  ÔÁË É ÇÒÕÐÐÁÍÉ  -- ÒÁÚÍÙÛÌÑÔØ,  ËÁË  Ó×ÅÒÇÎÕÔØ  ÔÉÒÁÎÁ. ÷ ×ÏÚÄÕÈÅ
+×ÉÔÁÌÉ  ÇÌÕÈÉÅ  ÕÇÒÏÚÙ. èÏÔÑ ÚÁÇÏ×ÏÒÝÉËÉ ÒÅÚËÏ ÒÁÚÌÉÞÁÌÉÓØ ÐÏ  ÒÏÄÕ ÚÁÎÑÔÉÊ,
+ÏÂÒÁÚÏ×ÁÎÉÀ  É  ÉÎÔÅÌÌÅËÔÕÁÌØÎÙÍ  ×ÏÚÍÏÖÎÏÓÔÑÍ, ÉÈ  ÏÂÝÁÑ  ÍÁÓÓÁ ÐÒÁËÔÉÞÅÓËÉ
+ÇÁÒÁÎÔÉÒÏ×ÁÌÁ ÉÚÂÁ×ÌÅÎÉÅ  ËÏÒÏÌÅ×ÓÔ×Á ÏÔ ÞÅÌÏ×ÅËÁ, ÖÉÒÅÀÝÅÇÏ  ÎÁ  ÓÔÒÁÄÁÎÉÑÈ
+ÐÏÄÄÁÎÎÙÈ... ÞÅÌÏ×ÅËÁ, ËÏÔÏÒÏÇÏ ÏÎÉ ÎÁÚÙ×ÁÌÉ ÷ÅÌÉËÉÊ óËÉ×.
+
+<ul><a name=3></a><h2>çìá÷á ðåò÷áñ</h2></ul>
+
+     åÓÌÉ É ÅÓÔØ ÎÅÞÔÏ,  ÞÅÍÕ  ÎÅÌØÚÑ ÏÂÕÞÉÔØÓÑ, ×ÙÓÔÕÐÁÑ × ËÁÞÅÓÔ×Å ËÒÕÔÏÇÏ
+ÂÒÁÔËÁ, ÔÁË ÜÔÏ ÉÓËÕÓÓÔ×Ï ÐÒÏ×ÅÄÅÎÉÑ ÄÅÌÏ×ÙÈ ÓÏ×ÅÝÁÎÉÊ.
+     ÷ ÐÒÏÉÚ×ÏÄÓÔ×ÅÎÎÙÈ  ÓÏÂÒÁÎÉÑÈ  (ÍÙ  ÉÈ ÎÁÚÙ×ÁÅÍ  ÒÁÚÂÏÒËÁÍÉ)  óÉÎÄÉËÁÔÁ
+ÏÂÙÞÎÏ  ÕÞÁÓÔ×ÕÅÔ   ÏÞÅÎØ  ÍÁÌÏ  ÌÀÄÅÊ  (ÞÔÏÂÙ   Ó×ÅÓÔÉ   Ë  ÍÉÎÉÍÕÍÕ  ÞÉÓÌÏ
+ÐÏÔÅÎÃÉÁÌØÎÙÈ  Ó×ÉÄÅÔÅÌÅÊ),  Á ÐÏ×ÅÓÔËÁ  ÄÎÑ,  ËÁË  ÐÒÁ×ÉÌÏ,  ÏÇÒÁÎÉÞÉ×ÁÅÔÓÑ
+ÒÁÓÓËÁÚÏÍ Ï ×ÏÚÎÉËÛÅÊ ÐÒÏÂÌÅÍÅ, Ó ÍÉÎÉÍÁÌØÎÏ  ×ÏÚÍÏÖÎÙÍ ËÏÌÉÞÅÓÔ×ÏÍ ÄÅÔÁÌÅÊ.
+úÁËÁÎÞÉ×ÁÅÔÓÑ ÄÅÌÏ×ÁÑ ×ÓÔÒÅÞÁ ÏÞÅÎØ ÐÒÏÓÔÙÍ ÒÅÛÅÎÉÅÍ: "òÁÚÂÅÒ
+
+<!-- <!---  --->  <!-- as ->? -->
+
+&qwe; &amp; &amp;amp;   &lt;a href=&quot;http://com&quot;&gt;
+
+
+</body>
+</html>