From c5cbb3cfe4b2192517ea6032c982e34db1d292b9 Mon Sep 17 00:00:00 2001 From: teodor Date: Thu, 4 Aug 2005 18:11:00 +0000 Subject: [PATCH] 1 Added simple tests 2 Add removing comment, style etc 3 Improve formatting 4 Add quot transformation --- Makefile | 13 +++ clrlibru.c | 214 ++++++++++++++++++++++++++++++++++++++++------ expected/test.htm | 23 +++++ tests/test.htm | 65 ++++++++++++++ 4 files changed, 288 insertions(+), 27 deletions(-) create mode 100644 expected/test.htm create mode 100644 tests/test.htm diff --git a/Makefile b/Makefile index bf32e85..04e3721 100644 --- a/Makefile +++ b/Makefile @@ -13,6 +13,19 @@ all: clrlibru clrlibru: $(OBJS) $(CC) $(LIB) -o $@ $(OBJS) +test: all + @[ -d results ] || mkdir results + @[ -d diffs ] || mkdir diffs + @for FILE in test.htm ; do \ + echo -n $$FILE " ........ " ; \ + if ./clrlibru < tests/$$FILE > results/$$FILE && diff -c expected/$$FILE results/$$FILE > diffs/$$FILE ; then \ + echo ok ; \ + else \ + echo FAILED ; \ + fi ; \ + done + clean: rm -rf clrlibru *core *.o + rm -rf results diffs diff --git a/clrlibru.c b/clrlibru.c index 825cb65..76b44f9 100644 --- a/clrlibru.c +++ b/clrlibru.c @@ -32,6 +32,8 @@ #include #include +static void pushoutstr(FILE *out, char *buf, int len); + static void usage() { printf("Clear text from lib.ru for Pocket PC, Version 0.2.\n"); @@ -41,12 +43,15 @@ usage() { exit(0); } -char* RemoveTag[]={ +static char* RemoveTag[]={ "select", "head", "div", "a", "form", + "script", + "style", + "title", NULL }; @@ -63,6 +68,43 @@ is_rtag(char *tag, int len) { return 0; } +typedef struct { + char *quote; + char *str; + int len; +} Quote; + +static Quote quoteChange[] = { + {"quot", "\"", -1}, + {"laquo", "\"", -1}, + {"raquo", "\"", -1}, + {"quot", "\"", -1}, + {"lt", "<", -1}, + {"gt", ">", -1}, + {"nbsp", " ", -1}, + {"mdash", "-", -1}, + {"amp", "&", -1}, + {"shy", "", -1}, + {NULL, NULL, -1} +}; + +static int +pushoutquot(FILE *out, char *buf, int buflen) { + Quote *ptr = quoteChange; + + buf[buflen]='\0'; + while( ptr->quote ) { + if ( strcmp( ptr->quote, buf ) == 0 ) { + if ( ptr->len < 0 ) + ptr->len = strlen( ptr->str ); + pushoutstr( out, ptr->str, ptr->len ); + return 1; + } + ptr++; + } + return 0; +} + static char *optarg = NULL; static int current=1; @@ -98,21 +140,85 @@ mgetopt(int argn, char* argv[], char *option) { return (int)key; } -#define INTXT 0 -#define INTAG 1 -#define FINDEND 2 +#define INTXT 0 +#define INTAG 1 +#define FINDEND 2 #define INDROPTAG 3 #define INDROPINTAG 4 #define INHEADTAG 5 #define INDROPINCLSTAG 6 +#define WAITAFTERRED 7 +#define COMMENTBEGIN1 8 +#define COMMENTBEGIN2 9 +#define COMMENTIN 10 +#define COMMENTEND1 11 +#define COMMENTEND2 12 +#define INQUOTE 13 +#define BUFFERLENGTH 8192 +#define REDSTRING " " + +typedef enum TypeOut { + Char, + NewLine, + Tag, + Paragraph, + None +} TypeOut; + + +static void +pushout( FILE *out, TypeOut type, int value ) { + static TypeOut PrevType=None; + static int prevvalue=0; + static int newlinecount=0; + + if ( type == Char ) { + if ( !(( PrevType==Tag || PrevType==None || PrevType==NewLine ) && ( value == ' ' || value == '\t' )) ) { + newlinecount=0; + fputc(value ,out); + } + prevvalue = value; + } else if ( type == NewLine ) { + if ( newlinecount < 2 ) + fputc('\n', out); + newlinecount++; + } else if ( type != PrevType ) { + switch(type) { + case Tag: + if ( !(PrevType==None || PrevType==NewLine || ( PrevType==Char && ( prevvalue == ' ' || prevvalue == '\t' || prevvalue == '\n' ) )) ) { + newlinecount=0; + fputc(' ', out); + } + break; + case Paragraph: + pushout(out, NewLine, 0); + fwrite(REDSTRING, sizeof(char), strlen(REDSTRING), out); + newlinecount=0; + break; + default: + printf("Unknown type: %d", type); + exit(1); + } + } + PrevType = type; +} + +static void +pushoutstr(FILE *out, char *buf, int len) { + char *ptr=buf; + while( ptr-buf= 0\n"); + if ( spacelen < 0 || spacelen >= BUFFERLENGTH ) { + printf("-l should be >= 0 and < %d\n", BUFFERLENGTH); exit(1); } break; @@ -145,6 +251,9 @@ main(int argn, char *argv[]) { } while( (ch=getc(in)) != EOF ) { + if ( ch == '\r' ) + continue; + if ( state==INTXT ) { if ( ch == '<' ) { state=INHEADTAG; @@ -152,23 +261,35 @@ main(int argn, char *argv[]) { } else if ( ch == '\n' ) { state=FINDEND; lenbuf=1; - *buf = ch; - } else if ( ch != '\r' ) - fputc(ch,out); + } else if ( ch=='&' ) { + *buf='&'; + lenbuf=1; + state=INQUOTE; + } else { + pushout(out, Char, ch); + } } else if ( state==INHEADTAG ) { if ( isalpha(ch) ) { - buf[ lenbuf ] = tolower(ch); - lenbuf++; + if ( lenbuf < BUFFERLENGTH-1 ) { + buf[ lenbuf ] = tolower(ch); + lenbuf++; + } + } else if ( ch == '!' ) { + state = COMMENTBEGIN1; } else if ( ch == '>' ) { if ( is_rtag(buf,lenbuf) ) { state = INDROPTAG; closelen=0; } else { state=INTXT; - fputc(' ',out); + if ( lenbuf==0 ) + pushoutstr(out, "<>", 2); + else + pushout(out, Tag, 0); } } else if ( lenbuf == 0 && ch != '/' ) { - fputc('<',out); fputc(ch,out); + pushout(out, Char, '<'); + pushout(out, Char, ch); state=INTXT; } else { if ( is_rtag(buf,lenbuf) ) { @@ -176,14 +297,13 @@ main(int argn, char *argv[]) { closelen=0; } else { state=INTAG; - fputc(' ',out); } } } else if ( state==INTAG ) { if ( ch == '>' ) { state=INTXT; - fputc(' ',out); - } + pushout(out, Tag, 0); + } } else if ( state == INDROPTAG ) { if ( ch == '<' ) { state=INDROPINTAG; @@ -205,28 +325,68 @@ main(int argn, char *argv[]) { } else state=INDROPTAG; } else if ( state==FINDEND ) { - if ( ch == ' ' ) { - buf[ lenbuf ] = ch; + if ( ch == ' ' || ch == '\t' ) { lenbuf++; if ( lenbuf > spacelen ) { - fwrite(buf, sizeof(char), lenbuf, out); - state=INTXT; + pushout( out, Paragraph, 0 ); + state=WAITAFTERRED; } } else if ( ch=='\n' ) { - buf[ lenbuf ] = ch; + pushout( out, NewLine, 0 ); + pushout( out, NewLine, 0 ); lenbuf++; - fwrite(buf, sizeof(char), lenbuf, out); + } else { state=INTXT; - } else if ( ch !='\r' ) { + pushout(out, Char, ' '); + ungetc(ch,in); + } + } else if ( state==WAITAFTERRED ) { + if ( !isspace(ch) ) { + ungetc(ch,in); state=INTXT; - fputc(' ',out); + } + } else if ( state==COMMENTBEGIN1 ) { + if ( ch == '-' ) { + state = COMMENTBEGIN2; + } else { + pushoutstr(out, "' ) + state = INTXT; + else if ( ch != '-' ) + state = COMMENTIN; + } else if ( state==INQUOTE ) { + if ( isalpha( ch ) && lenbuf < BUFFERLENGTH-2 ) { + buf[ lenbuf ] = ch; + lenbuf++; + } else if ( ch == ';' && lenbuf>1 && pushoutquot( out, buf+1, lenbuf-1 ) ) { + state = INTXT; + } else { + pushoutstr(out, buf, lenbuf); + state = INTXT; ungetc(ch,in); - } + } } else { printf("Unknown state: %d\n", state); exit(1); } - } + } if ( in!=stdin ) fclose(in); diff --git a/expected/test.htm b/expected/test.htm new file mode 100644 index 0000000..a74e6a9 --- /dev/null +++ b/expected/test.htm @@ -0,0 +1,23 @@ +WOW + +NOOO + +YES + asdasd sdasdasas asd asd asd asdfaadfaaaa aaaasdfaaasdf a asdf asdfa asdfasdf + + fsdfsd sdfsdffd sdf sdf sdfsdfssdsd sdfsdfs sdf sdf sd fsdfsfsfdee BOLD feseseee se + fsdfsd sdfsdffd sdf sdf sdfsdfssdsd sdfsdfs sdf sdf sd fsdfsfsfdee BOLDITALIC feseseee se + + fsdfsd sdfsdffd sdf sdf sd fsdfsfsfdee BOLDITALIC feseseee se + + ìÀÄÉ, ËÏÔÏÒÙÅ ÎÉËÏÇÄÁ ÎÅ ÓÞÉÔÁÌÉ ÓÅÂÑ ÇÅÒÏÑÍÉ, ÎÁÞÁÌÉ -- ËÁË × ÏÄÉÎÏÞËÕ, ÔÁË É ÇÒÕÐÐÁÍÉ -- ÒÁÚÍÙÛÌÑÔØ, ËÁË Ó×ÅÒÇÎÕÔØ ÔÉÒÁÎÁ. ÷ ×ÏÚÄÕÈÅ ×ÉÔÁÌÉ ÇÌÕÈÉÅ ÕÇÒÏÚÙ. èÏÔÑ ÚÁÇÏ×ÏÒÝÉËÉ ÒÅÚËÏ ÒÁÚÌÉÞÁÌÉÓØ ÐÏ ÒÏÄÕ ÚÁÎÑÔÉÊ, ÏÂÒÁÚÏ×ÁÎÉÀ É ÉÎÔÅÌÌÅËÔÕÁÌØÎÙÍ ×ÏÚÍÏÖÎÏÓÔÑÍ, ÉÈ ÏÂÝÁÑ ÍÁÓÓÁ ÐÒÁËÔÉÞÅÓËÉ ÇÁÒÁÎÔÉÒÏ×ÁÌÁ ÉÚÂÁ×ÌÅÎÉÅ ËÏÒÏÌÅ×ÓÔ×Á ÏÔ ÞÅÌÏ×ÅËÁ, ÖÉÒÅÀÝÅÇÏ ÎÁ ÓÔÒÁÄÁÎÉÑÈ ÐÏÄÄÁÎÎÙÈ... ÞÅÌÏ×ÅËÁ, ËÏÔÏÒÏÇÏ ÏÎÉ ÎÁÚÙ×ÁÌÉ ÷ÅÌÉËÉÊ óËÉ×. + +çìá÷á ðåò÷áñ + + åÓÌÉ É ÅÓÔØ ÎÅÞÔÏ, ÞÅÍÕ ÎÅÌØÚÑ ÏÂÕÞÉÔØÓÑ, ×ÙÓÔÕÐÁÑ × ËÁÞÅÓÔ×Å ËÒÕÔÏÇÏ ÂÒÁÔËÁ, ÔÁË ÜÔÏ ÉÓËÕÓÓÔ×Ï ÐÒÏ×ÅÄÅÎÉÑ ÄÅÌÏ×ÙÈ ÓÏ×ÅÝÁÎÉÊ. + ÷ ÐÒÏÉÚ×ÏÄÓÔ×ÅÎÎÙÈ ÓÏÂÒÁÎÉÑÈ (ÍÙ ÉÈ ÎÁÚÙ×ÁÅÍ ÒÁÚÂÏÒËÁÍÉ) óÉÎÄÉËÁÔÁ ÏÂÙÞÎÏ ÕÞÁÓÔ×ÕÅÔ ÏÞÅÎØ ÍÁÌÏ ÌÀÄÅÊ (ÞÔÏÂÙ Ó×ÅÓÔÉ Ë ÍÉÎÉÍÕÍÕ ÞÉÓÌÏ ÐÏÔÅÎÃÉÁÌØÎÙÈ Ó×ÉÄÅÔÅÌÅÊ), Á ÐÏ×ÅÓÔËÁ ÄÎÑ, ËÁË ÐÒÁ×ÉÌÏ, ÏÇÒÁÎÉÞÉ×ÁÅÔÓÑ ÒÁÓÓËÁÚÏÍ Ï ×ÏÚÎÉËÛÅÊ ÐÒÏÂÌÅÍÅ, Ó ÍÉÎÉÍÁÌØÎÏ ×ÏÚÍÏÖÎÙÍ ËÏÌÉÞÅÓÔ×ÏÍ ÄÅÔÁÌÅÊ. úÁËÁÎÞÉ×ÁÅÔÓÑ ÄÅÌÏ×ÁÑ ×ÓÔÒÅÞÁ ÏÞÅÎØ ÐÒÏÓÔÙÍ ÒÅÛÅÎÉÅÍ: "òÁÚÂÅÒ + + + +&qwe; & & + diff --git a/tests/test.htm b/tests/test.htm new file mode 100644 index 0000000..69cb0d8 --- /dev/null +++ b/tests/test.htm @@ -0,0 +1,65 @@ + + +qq + + +WOW + + + +NOOO + + +YES + asdasd +sdasdasas asd asd +asd asdfaadfaaaa aaaasdfaaasdf a +asdf asdfa asdfasdf + + + + fsdfsd sdfsdffd sdf +sdf sdfsdfssdsd sdfsdfs sdf +sdf sd fsdfsfsfdee BOLD feseseee se + fsdfsd sdfsdffd sdf +sdf sdfsdfssdsd sdfsdfs sdf +sdf sd fsdfsfsfdee BOLDITALIC feseseee se + + + + + + + + fsdfsd sdfsdffd sdf +sdf sd fsdfsfsfdee BOLDITALIC feseseee se + + + ìÀÄÉ, ËÏÔÏÒÙÅ ÎÉËÏÇÄÁ ÎÅ ÓÞÉÔÁÌÉ ÓÅÂÑ ÇÅÒÏÑÍÉ, ÎÁÞÁÌÉ -- ËÁË × +ÏÄÉÎÏÞËÕ, ÔÁË É ÇÒÕÐÐÁÍÉ -- ÒÁÚÍÙÛÌÑÔØ, ËÁË Ó×ÅÒÇÎÕÔØ ÔÉÒÁÎÁ. ÷ ×ÏÚÄÕÈÅ +×ÉÔÁÌÉ ÇÌÕÈÉÅ ÕÇÒÏÚÙ. èÏÔÑ ÚÁÇÏ×ÏÒÝÉËÉ ÒÅÚËÏ ÒÁÚÌÉÞÁÌÉÓØ ÐÏ ÒÏÄÕ ÚÁÎÑÔÉÊ, +ÏÂÒÁÚÏ×ÁÎÉÀ É ÉÎÔÅÌÌÅËÔÕÁÌØÎÙÍ ×ÏÚÍÏÖÎÏÓÔÑÍ, ÉÈ ÏÂÝÁÑ ÍÁÓÓÁ ÐÒÁËÔÉÞÅÓËÉ +ÇÁÒÁÎÔÉÒÏ×ÁÌÁ ÉÚÂÁ×ÌÅÎÉÅ ËÏÒÏÌÅ×ÓÔ×Á ÏÔ ÞÅÌÏ×ÅËÁ, ÖÉÒÅÀÝÅÇÏ ÎÁ ÓÔÒÁÄÁÎÉÑÈ +ÐÏÄÄÁÎÎÙÈ... ÞÅÌÏ×ÅËÁ, ËÏÔÏÒÏÇÏ ÏÎÉ ÎÁÚÙ×ÁÌÉ ÷ÅÌÉËÉÊ óËÉ×. + +

    çìá÷á ðåò÷áñ

+ + åÓÌÉ É ÅÓÔØ ÎÅÞÔÏ, ÞÅÍÕ ÎÅÌØÚÑ ÏÂÕÞÉÔØÓÑ, ×ÙÓÔÕÐÁÑ × ËÁÞÅÓÔ×Å ËÒÕÔÏÇÏ +ÂÒÁÔËÁ, ÔÁË ÜÔÏ ÉÓËÕÓÓÔ×Ï ÐÒÏ×ÅÄÅÎÉÑ ÄÅÌÏ×ÙÈ ÓÏ×ÅÝÁÎÉÊ. + ÷ ÐÒÏÉÚ×ÏÄÓÔ×ÅÎÎÙÈ ÓÏÂÒÁÎÉÑÈ (ÍÙ ÉÈ ÎÁÚÙ×ÁÅÍ ÒÁÚÂÏÒËÁÍÉ) óÉÎÄÉËÁÔÁ +ÏÂÙÞÎÏ ÕÞÁÓÔ×ÕÅÔ ÏÞÅÎØ ÍÁÌÏ ÌÀÄÅÊ (ÞÔÏÂÙ Ó×ÅÓÔÉ Ë ÍÉÎÉÍÕÍÕ ÞÉÓÌÏ +ÐÏÔÅÎÃÉÁÌØÎÙÈ Ó×ÉÄÅÔÅÌÅÊ), Á ÐÏ×ÅÓÔËÁ ÄÎÑ, ËÁË ÐÒÁ×ÉÌÏ, ÏÇÒÁÎÉÞÉ×ÁÅÔÓÑ +ÒÁÓÓËÁÚÏÍ Ï ×ÏÚÎÉËÛÅÊ ÐÒÏÂÌÅÍÅ, Ó ÍÉÎÉÍÁÌØÎÏ ×ÏÚÍÏÖÎÙÍ ËÏÌÉÞÅÓÔ×ÏÍ ÄÅÔÁÌÅÊ. +úÁËÁÎÞÉ×ÁÅÔÓÑ ÄÅÌÏ×ÁÑ ×ÓÔÒÅÞÁ ÏÞÅÎØ ÐÒÏÓÔÙÍ ÒÅÛÅÎÉÅÍ: "òÁÚÂÅÒ + + + +&qwe; & &amp; <a href="http://com"> + + + + -- 2.37.3