Initial revision
[clrlibru.git] / clrlibru.c
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <ctype.h>
5
/*
 * Print the program banner and the command-line synopsis to stdout,
 * then terminate the process with exit status 0.  Does not return.
 */
static void
usage() {
        static const char *const text[] = {
                "Clear text from lib.ru for Pocket PC, Version 0.2.\n",
                "Author: Teodor Sigaev <teodor@sigaev.ru>\n",
                "Usage:\n   clrlibru [-i INPUTFILE] [-o OUTPUT] [-l NUMSPACE]\n"
        };
        size_t i;

        for ( i = 0; i < sizeof(text) / sizeof(text[0]); i++ )
                fputs(text[i], stdout);

        exit(0);
}
13
/*
 * Lowercase names of the tags that is_rtag() recognises.
 * The list is terminated by a NULL entry.
 */
char* RemoveTag[]={
        "select",
        "head",
        "div",
        "a",
        "form",
        NULL
};

/*
 * Tell whether a tag name is listed in RemoveTag.
 *
 * tag - tag name; only the first len characters are examined, so the
 *       buffer does not have to be NUL-terminated.
 * len - number of characters of tag that make up the name.
 *
 * Returns 1 when those len characters are exactly equal to one of the
 * RemoveTag entries, 0 otherwise (including for an empty name).
 *
 * The buffer is never written to: the comparison is done by length plus
 * memcmp() instead of planting a terminator at tag[len], which would
 * modify the caller's data and write out of bounds when len equals the
 * buffer capacity.
 */
static int
is_rtag(const char *tag, int len) {
        char **ptr;

        if ( tag == NULL || len <= 0 )
                return 0;

        for ( ptr = RemoveTag; *ptr; ptr++ ) {
                if ( strlen(*ptr) == (size_t)len &&
                     memcmp(tag, *ptr, (size_t)len) == 0 )
                        return 1;
        }
        return 0;
}
35
36
/*
 * State of the option parser:
 *   optarg  - value of the option returned by the most recent mgetopt()
 *             call, or NULL when that option takes no value;
 *   current - index in argv of the next argument to examine
 *             (starts at 1, so argv[0] is skipped).
 */
static char *optarg = NULL;
static int current=1;

/*
 * Minimal getopt-like parser for single-letter options.
 *
 * argn   - number of entries in argv.
 * argv   - argument vector; every argument is expected to be an option
 *          introduced by '-' or '/', optionally followed by a separate
 *          value argument.
 * option - list of accepted option letters; a letter followed by ':'
 *          requires a value, which is taken from the next argument.
 *
 * Returns the option letter and sets optarg (to the value, or to NULL for
 * options without one), or returns -1 once all arguments are consumed.
 * On an unknown option, a non-option argument, or a missing value, a
 * message is printed and the process exits with status 1.
 *
 * Only the character right after the '-' or '/' is examined; any further
 * characters of that argument are ignored.
 */
int
mgetopt(int argn, char* argv[], char *option) {
        char *arg;
        char *spec;
        char key;

        if ( current >= argn ) return -1;

        arg = argv[current];

        /* Check the prefix first so that arg[1] is never read for "". */
        if ( *arg != '-' && *arg != '/' ) {
                printf("Unknown option: %s\n", arg);
                exit(1);
        }

        key = arg[1];

        /*
         * strchr() also matches the terminating '\0', and ':' is only a
         * marker inside the option list, so neither is a valid letter.
         */
        if ( key == '\0' || key == ':' )
                spec = NULL;
        else
                spec = strchr( option, key );

        if ( spec == NULL ) {
                printf( "Unknown option: %s\n", arg);
                exit(1);
        }

        current++;
        if ( spec[1] == ':' ) {
                /* The option takes a value: consume the next argument. */
                if ( current >= argn ) {
                        printf("No value for -%c\n" ,key);
                        exit(1);
                }
                optarg=argv[current];
                current++;
        } else
                optarg=NULL;

        return (int)key;
}
70
71 #define INTXT   0
72 #define INTAG   1
73 #define FINDEND 2
74 #define INDROPTAG       3
75 #define INDROPINTAG     4
76 #define INHEADTAG       5
77 #define INDROPINCLSTAG  6
78
79
80 int 
81 main(int argn, char *argv[]) {
82         int ch;
83         FILE    *in=stdin, *out=stdout;
84         int state=INTXT;
85         char    buf[8192];
86         int lenbuf=0,closelen=0;
87         int spacelen=4;
88
89         while((ch = mgetopt(argn, argv, "l:i:o:h?"))!=-1) {
90                 switch (ch) {
91                         case 'i':
92                                 if ( (in=fopen(optarg, "r"))==NULL) {
93                                         printf("Can't open file %s\n", optarg);
94                                         exit(1);
95                                 }
96                                 break;
97                         case 'o':
98                                 if ( (out=fopen(optarg, "w"))==NULL) {
99                                         printf("Can't open file %s\n",optarg);
100                                         exit(1);
101                                 }
102                                 break;
103                         case 'l':
104                                 spacelen = atoi(optarg);
105                                 if ( spacelen < 0 ) {
106                                         printf("-l should be >= 0\n");
107                                         exit(1);
108                                 }
109                                 break;
110                         case 'h':
111                         case '?':
112                         default:
113                                 usage();
114                 }
115         }
116         
117         while( (ch=getc(in)) != EOF ) {
118                 if ( state==INTXT ) {
119                         if ( ch == '<' ) {
120                                 state=INHEADTAG;
121                                 lenbuf=0;
122                         } else if ( ch == '\n' ) {
123                                 state=FINDEND;
124                                 lenbuf=1;
125                                 *buf = ch;
126                         } else if ( ch != '\r' )
127                                 fputc(ch,out);
128                 } else if ( state==INHEADTAG ) {
129                         if ( isalpha(ch) ) {
130                                 buf[ lenbuf ] = tolower(ch);
131                                 lenbuf++;
132                         } else if ( ch == '>' ) {
133                                 if ( is_rtag(buf,lenbuf) ) {
134                                         state = INDROPTAG;
135                                         closelen=0;
136                                 } else {
137                                         state=INTXT;
138                                         fputc(' ',out);
139                                 }
140                         } else if ( lenbuf == 0 && ch != '/' ) {
141                                 fputc('<',out); fputc(ch,out); 
142                                 state=INTXT;
143                         } else { 
144                                 if ( is_rtag(buf,lenbuf) ) {
145                                         state = INDROPTAG;
146                                         closelen=0;
147                                 } else {
148                                         state=INTAG;
149                                         fputc(' ',out);
150                                 }
151                         }
152                 } else if ( state==INTAG ) {
153                         if ( ch == '>' ) {
154                                 state=INTXT;
155                                 fputc(' ',out);
156                         }       
157                 } else if ( state == INDROPTAG ) {
158                         if ( ch == '<' ) {
159                                 state=INDROPINTAG;
160                                 closelen=0;
161                         }
162                 } else if ( state == INDROPINTAG ) {
163                         if ( ch == '/' )
164                                 state=INDROPINCLSTAG;
165                         else
166                                 state=INDROPTAG;
167                 } else if ( state == INDROPINCLSTAG ) {
168                         if ( isalpha(ch) ) {
169                                 if ( closelen < lenbuf && tolower(ch) == buf[closelen] ) {
170                                         closelen++;
171                                         if ( closelen==lenbuf )
172                                                 state=INTAG;
173                                 } else 
174                                         state=INDROPTAG;
175                         } else
176                                 state=INDROPTAG;
177                 } else if ( state==FINDEND ) {
178                         if ( ch == ' ' ) {
179                                 buf[ lenbuf ] = ch;
180                                 lenbuf++;
181                                 if ( lenbuf > spacelen ) {
182                                         fwrite(buf, sizeof(char), lenbuf, out);
183                                         state=INTXT;
184                                 }
185                         } else if ( ch=='\n' ) {
186                                 buf[ lenbuf ] = ch;
187                                 lenbuf++;
188                                 fwrite(buf, sizeof(char), lenbuf, out);
189                                 state=INTXT;
190                         } else if ( ch !='\r' ) {
191                                 state=INTXT;
192                                 fputc(' ',out);
193                                 ungetc(ch,in);
194                         } 
195                 } else {
196                         printf("Unknown state: %d\n", state);
197                         exit(1);
198                 }
199         } 
200                         
201         if ( in!=stdin )
202                 fclose(in);
203         if ( out!=stdout )
204                 fclose(out);
205
206         return 0;       
207 }
208