#include <string.h>
#include <ctype.h>
+static void pushoutstr(FILE *out, char *buf, int len);
+
static void
usage() {
printf("Clear text from lib.ru for Pocket PC, Version 0.2.\n");
exit(0);
}
-char* RemoveTag[]={
+static char* RemoveTag[]={ /* NULL-terminated list of lowercase tag names; presumably the set is_rtag() checks to pick elements whose content is dropped -- is_rtag() is not visible here, TODO confirm */
"select",
"head",
"div",
"a",
"form",
+ "script",
+ "style",
+ "title",
NULL
};
return 0;
}
+typedef struct { /* One row of the entity translation table used by pushoutquot(). */
+ char *quote; /* entity name, compared against the text collected between '&' and ';' */
+ char *str; /* text emitted in place of the entity */
+ int len; /* strlen(str); -1 means "not computed yet" and is filled in on first use */
+} Quote;
+
+/*
+ * Named character entities recognised by pushoutquot() and the text that
+ * replaces each of them.  The lookup is a linear scan that stops at the
+ * first matching name, so every name is listed exactly once.  The len
+ * field starts at -1 and is replaced by strlen(str) the first time an
+ * entry is used.  The table ends with an entry whose quote is NULL.
+ */
+static Quote quoteChange[] = {
+	{"quot", "\"", -1},
+	{"laquo", "\"", -1},
+	{"raquo", "\"", -1},
+	{"lt", "<", -1},
+	{"gt", ">", -1},
+	{"nbsp", " ", -1},
+	{"mdash", "-", -1},
+	{"amp", "&", -1},
+	{"shy", "", -1},	/* soft hyphen: replaced by nothing */
+	{NULL, NULL, -1}
+};
+
+/*
+ * pushoutquot - emit the replacement text for a named character entity.
+ *
+ * buf points at the entity name (the characters between '&' and ';') and
+ * buflen is its length in bytes.  The name does not have to be
+ * NUL-terminated and the buffer is never written to, so the caller needs
+ * no spare byte after the name.
+ *
+ * The name is looked up in quoteChange.  On a match the replacement
+ * string is passed to pushoutstr() and 1 is returned.  When the name is
+ * unknown, or buflen is not positive, nothing is written and 0 is
+ * returned.
+ */
+static int
+pushoutquot(FILE *out, char *buf, int buflen) {
+	Quote *ptr;
+
+	if ( buflen <= 0 )
+		return 0;
+
+	for ( ptr = quoteChange; ptr->quote; ptr++ ) {
+		/* Exact match only: same length and same bytes. */
+		if ( strlen( ptr->quote ) != (size_t)buflen ||
+		     memcmp( ptr->quote, buf, (size_t)buflen ) != 0 )
+			continue;
+
+		/* The replacement length is computed once and cached in the table. */
+		if ( ptr->len < 0 )
+			ptr->len = (int)strlen( ptr->str );
+		pushoutstr( out, ptr->str, ptr->len );
+		return 1;
+	}
+	return 0;
+}
+
static char *optarg = NULL;
static int current=1;
return (int)key;
}
-#define INTXT 0
-#define INTAG 1
-#define FINDEND 2
+#define INTXT 0
+#define INTAG 1
+#define FINDEND 2
#define INDROPTAG 3
#define INDROPINTAG 4
#define INHEADTAG 5
#define INDROPINCLSTAG 6
+#define WAITAFTERRED 7 /* paragraph indent already emitted; skip whitespace until the next non-space character */
+#define COMMENTBEGIN1 8 /* '!' seen while reading a tag name; a following '-' may start a comment */
+#define COMMENTBEGIN2 9 /* "!-" seen; one more '-' enters the comment */
+#define COMMENTIN 10 /* inside a comment; input is discarded */
+#define COMMENTEND1 11 /* one '-' seen inside a comment */
+#define COMMENTEND2 12 /* "--" seen inside a comment; '>' ends it */
+#define INQUOTE 13 /* '&' seen; collecting the letters of an entity name */
+#define BUFFERLENGTH 8192 /* size of the scratch buffer in main(); also the exclusive upper bound for -l */
+#define REDSTRING " " /* text pushout() writes at the start of a Paragraph */
+/* Kinds of item that can be handed to pushout(). */
+typedef enum TypeOut {
+ Char, /* a single character; its code is passed as the value argument */
+ NewLine, /* a line break */
+ Tag, /* position of a removed tag; may turn into a separating space */
+ Paragraph, /* start of a paragraph: line break followed by REDSTRING */
+ None /* initial "nothing emitted yet" value of pushout()'s previous-type state */
+} TypeOut;
+
+/*
+ * pushout - write one output item to out, normalising whitespace.
+ *
+ * type selects what is emitted; value is the character code for Char and
+ * is not used by the other types.  The previous type, the previous Char
+ * value and a newline counter are kept in static variables, so this
+ * state is shared between all calls regardless of the stream passed in.
+ *
+ *   Char      - written as is, except that a space or tab is dropped when
+ *               the previous item was a Tag, a NewLine, or nothing at all.
+ *   NewLine   - writes '\n' only while the newline counter is below two;
+ *               the counter is incremented either way.
+ *   Tag       - writes a single space unless the previous item was a Tag,
+ *               a NewLine, nothing at all, or a Char holding a space, tab
+ *               or newline.
+ *   Paragraph - emits a NewLine and then REDSTRING; does nothing when the
+ *               previous item was already a Paragraph.
+ *
+ * A type other than the four above that also differs from the previous
+ * type prints "Unknown type" and exits with status 1.
+ */
+static void
+pushout( FILE *out, TypeOut type, int value ) {
+ static TypeOut PrevType=None; /* type passed to the previous call; None before the first call */
+ static int prevvalue=0; /* value of the most recent Char, whether it was written or dropped */
+ static int newlinecount=0; /* NewLine items seen since the counter was last reset */
+
+ if ( type == Char ) {
+ if ( !(( PrevType==Tag || PrevType==None || PrevType==NewLine ) && ( value == ' ' || value == '\t' )) ) { /* skip blanks that would lead a line or follow a tag */
+ newlinecount=0;
+ fputc(value ,out);
+ }
+ prevvalue = value; /* recorded even when the character was dropped */
+ } else if ( type == NewLine ) {
+ if ( newlinecount < 2 ) /* at most two line breaks in a row reach the output */
+ fputc('\n', out);
+ newlinecount++;
+ } else if ( type != PrevType ) { /* repeated Tag or Paragraph items collapse into the first one */
+ switch(type) {
+ case Tag:
+ if ( !(PrevType==None || PrevType==NewLine || ( PrevType==Char && ( prevvalue == ' ' || prevvalue == '\t' || prevvalue == '\n' ) )) ) { /* separate words only when no whitespace precedes the tag */
+ newlinecount=0;
+ fputc(' ', out);
+ }
+ break;
+ case Paragraph:
+ pushout(out, NewLine, 0); /* recursive call leaves PrevType as NewLine; it is overwritten at the end of this call */
+ fwrite(REDSTRING, sizeof(char), strlen(REDSTRING), out);
+ newlinecount=0;
+ break;
+ default:
+ printf("Unknown type: %d", type);
+ exit(1);
+ }
+ }
+ PrevType = type;
+}
+
+/*
+ * Feed the first len bytes of buf to pushout(), one byte per call, each
+ * as a Char item.  Nothing is emitted when len is zero or negative.
+ */
+static void
+pushoutstr(FILE *out, char *buf, int len) {
+	int i;
+
+	for ( i = 0; i < len; i++ )
+		pushout(out, Char, (int)buf[i]);
+}
int
main(int argn, char *argv[]) {
int ch;
FILE *in=stdin, *out=stdout;
int state=INTXT;
- char buf[8192];
+ char buf[BUFFERLENGTH];
int lenbuf=0,closelen=0;
int spacelen=4;
break;
case 'l':
spacelen = atoi(optarg);
- if ( spacelen < 0 ) {
- printf("-l should be >= 0\n");
+ if ( spacelen < 0 || spacelen >= BUFFERLENGTH ) {
+ printf("-l should be >= 0 and < %d\n", BUFFERLENGTH);
exit(1);
}
break;
}
while( (ch=getc(in)) != EOF ) {
+ if ( ch == '\r' )
+ continue;
+
if ( state==INTXT ) {
if ( ch == '<' ) {
state=INHEADTAG;
} else if ( ch == '\n' ) {
state=FINDEND;
lenbuf=1;
- *buf = ch;
- } else if ( ch != '\r' )
- fputc(ch,out);
+ } else if ( ch=='&' ) {
+ *buf='&';
+ lenbuf=1;
+ state=INQUOTE;
+ } else {
+ pushout(out, Char, ch);
+ }
} else if ( state==INHEADTAG ) {
if ( isalpha(ch) ) {
- buf[ lenbuf ] = tolower(ch);
- lenbuf++;
+ if ( lenbuf < BUFFERLENGTH-1 ) {
+ buf[ lenbuf ] = tolower(ch);
+ lenbuf++;
+ }
+ } else if ( ch == '!' ) {
+ state = COMMENTBEGIN1;
} else if ( ch == '>' ) {
if ( is_rtag(buf,lenbuf) ) {
state = INDROPTAG;
closelen=0;
} else {
state=INTXT;
- fputc(' ',out);
+ if ( lenbuf==0 )
+ pushoutstr(out, "<>", 2);
+ else
+ pushout(out, Tag, 0);
}
} else if ( lenbuf == 0 && ch != '/' ) {
- fputc('<',out); fputc(ch,out);
+ pushout(out, Char, '<');
+ pushout(out, Char, ch);
state=INTXT;
} else {
if ( is_rtag(buf,lenbuf) ) {
closelen=0;
} else {
state=INTAG;
- fputc(' ',out);
}
}
} else if ( state==INTAG ) {
if ( ch == '>' ) {
state=INTXT;
- fputc(' ',out);
- }
+ pushout(out, Tag, 0);
+ }
} else if ( state == INDROPTAG ) {
if ( ch == '<' ) {
state=INDROPINTAG;
} else
state=INDROPTAG;
} else if ( state==FINDEND ) {
- if ( ch == ' ' ) {
- buf[ lenbuf ] = ch;
+ if ( ch == ' ' || ch == '\t' ) {
lenbuf++;
if ( lenbuf > spacelen ) {
- fwrite(buf, sizeof(char), lenbuf, out);
- state=INTXT;
+ pushout( out, Paragraph, 0 );
+ state=WAITAFTERRED;
}
} else if ( ch=='\n' ) {
- buf[ lenbuf ] = ch;
+ pushout( out, NewLine, 0 );
+ pushout( out, NewLine, 0 );
lenbuf++;
- fwrite(buf, sizeof(char), lenbuf, out);
+ } else {
state=INTXT;
- } else if ( ch !='\r' ) {
+ pushout(out, Char, ' ');
+ ungetc(ch,in);
+ }
+ } else if ( state==WAITAFTERRED ) {
+ if ( !isspace(ch) ) {
+ ungetc(ch,in);
state=INTXT;
- fputc(' ',out);
+ }
+ } else if ( state==COMMENTBEGIN1 ) {
+ if ( ch == '-' ) {
+ state = COMMENTBEGIN2;
+ } else {
+ pushoutstr(out, "<!", 2);
+ ungetc(ch,in);
+ state=INTXT;
+ }
+ } else if ( state==COMMENTBEGIN2 ) {
+ if ( ch == '-' ) {
+ state = COMMENTIN;
+ } else {
+ pushoutstr(out, "<!-", 2);
+ ungetc(ch,in);
+ state=INTXT;
+ }
+ } else if ( state==COMMENTIN ) {
+ if ( ch == '-' )
+ state = COMMENTEND1;
+ } else if ( state==COMMENTEND1 ) {
+ state = ( ch == '-' ) ? COMMENTEND2 : COMMENTIN;
+ } else if ( state==COMMENTEND2 ) {
+ if ( ch == '>' )
+ state = INTXT;
+ else if ( ch != '-' )
+ state = COMMENTIN;
+ } else if ( state==INQUOTE ) {
+ if ( isalpha( ch ) && lenbuf < BUFFERLENGTH-2 ) {
+ buf[ lenbuf ] = ch;
+ lenbuf++;
+ } else if ( ch == ';' && lenbuf>1 && pushoutquot( out, buf+1, lenbuf-1 ) ) {
+ state = INTXT;
+ } else {
+ pushoutstr(out, buf, lenbuf);
+ state = INTXT;
ungetc(ch,in);
- }
+ }
} else {
printf("Unknown state: %d\n", state);
exit(1);
}
- }
+ }
if ( in!=stdin )
fclose(in);
--- /dev/null
+<html>
+<head>
+<title>qq</title>
+</head>
+<body>
+WOW
+
+<script language="javascript">
+<!--
+var EH;
+//-->
+</script>
+
+NOOO
+<select><option value="QW"></select>
+
+YES
+ asdasd
+sdasdasas asd asd
+asd asdfaadfaaaa aaaasdfaaasdf a
+asdf asdfa asdfasdf
+
+
+
+ fsdfsd sdfsdffd sdf
+sdf sdfsdfssdsd sdfsdfs sdf
+sdf sd fsdfsfsfdee <b>BOLD</b> feseseee se
+ fsdfsd sdfsdffd sdf
+sdf sdfsdfssdsd sdfsdfs sdf
+sdf sd fsdfsfsfdee <b><i>BOLDITALIC</I></b> feseseee se
+
+
+
+
+
+
+
+ fsdfsd sdfsdffd sdf
+sdf sd fsdfsfsfdee <b><i>BOLDITALIC</I></b> feseseee se
+
+
+ ìÀÄÉ, ËÏÔÏÒÙÅ ÎÉËÏÇÄÁ ÎÅ ÓÞÉÔÁÌÉ ÓÅÂÑ ÇÅÒÏÑÍÉ, ÎÁÞÁÌÉ -- ËÁË ×
+ÏÄÉÎÏÞËÕ, ÔÁË É ÇÒÕÐÐÁÍÉ -- ÒÁÚÍÙÛÌÑÔØ, ËÁË Ó×ÅÒÇÎÕÔØ ÔÉÒÁÎÁ. ÷ ×ÏÚÄÕÈÅ
+×ÉÔÁÌÉ ÇÌÕÈÉÅ ÕÇÒÏÚÙ. èÏÔÑ ÚÁÇÏ×ÏÒÝÉËÉ ÒÅÚËÏ ÒÁÚÌÉÞÁÌÉÓØ ÐÏ ÒÏÄÕ ÚÁÎÑÔÉÊ,
+ÏÂÒÁÚÏ×ÁÎÉÀ É ÉÎÔÅÌÌÅËÔÕÁÌØÎÙÍ ×ÏÚÍÏÖÎÏÓÔÑÍ, ÉÈ ÏÂÝÁÑ ÍÁÓÓÁ ÐÒÁËÔÉÞÅÓËÉ
+ÇÁÒÁÎÔÉÒÏ×ÁÌÁ ÉÚÂÁ×ÌÅÎÉÅ ËÏÒÏÌÅ×ÓÔ×Á ÏÔ ÞÅÌÏ×ÅËÁ, ÖÉÒÅÀÝÅÇÏ ÎÁ ÓÔÒÁÄÁÎÉÑÈ
+ÐÏÄÄÁÎÎÙÈ... ÞÅÌÏ×ÅËÁ, ËÏÔÏÒÏÇÏ ÏÎÉ ÎÁÚÙ×ÁÌÉ ÷ÅÌÉËÉÊ óËÉ×.
+
+<ul><a name=3></a><h2>çìá÷á ðåò÷áñ</h2></ul>
+
+ åÓÌÉ É ÅÓÔØ ÎÅÞÔÏ, ÞÅÍÕ ÎÅÌØÚÑ ÏÂÕÞÉÔØÓÑ, ×ÙÓÔÕÐÁÑ × ËÁÞÅÓÔ×Å ËÒÕÔÏÇÏ
+ÂÒÁÔËÁ, ÔÁË ÜÔÏ ÉÓËÕÓÓÔ×Ï ÐÒÏ×ÅÄÅÎÉÑ ÄÅÌÏ×ÙÈ ÓÏ×ÅÝÁÎÉÊ.
+ ÷ ÐÒÏÉÚ×ÏÄÓÔ×ÅÎÎÙÈ ÓÏÂÒÁÎÉÑÈ (ÍÙ ÉÈ ÎÁÚÙ×ÁÅÍ ÒÁÚÂÏÒËÁÍÉ) óÉÎÄÉËÁÔÁ
+ÏÂÙÞÎÏ ÕÞÁÓÔ×ÕÅÔ ÏÞÅÎØ ÍÁÌÏ ÌÀÄÅÊ (ÞÔÏÂÙ Ó×ÅÓÔÉ Ë ÍÉÎÉÍÕÍÕ ÞÉÓÌÏ
+ÐÏÔÅÎÃÉÁÌØÎÙÈ Ó×ÉÄÅÔÅÌÅÊ), Á ÐÏ×ÅÓÔËÁ ÄÎÑ, ËÁË ÐÒÁ×ÉÌÏ, ÏÇÒÁÎÉÞÉ×ÁÅÔÓÑ
+ÒÁÓÓËÁÚÏÍ Ï ×ÏÚÎÉËÛÅÊ ÐÒÏÂÌÅÍÅ, Ó ÍÉÎÉÍÁÌØÎÏ ×ÏÚÍÏÖÎÙÍ ËÏÌÉÞÅÓÔ×ÏÍ ÄÅÔÁÌÅÊ.
+úÁËÁÎÞÉ×ÁÅÔÓÑ ÄÅÌÏ×ÁÑ ×ÓÔÒÅÞÁ ÏÞÅÎØ ÐÒÏÓÔÙÍ ÒÅÛÅÎÉÅÍ: "òÁÚÂÅÒ
+
+<!-- <!--- ---> <!-- as ->? -->
+
+&qwe; & &amp; <a href="http://com">
+
+
+</body>
+</html>