add .gitignore
[clrlibru.git] / clrlibru.c
1 /*
2  * Copyright (c) 2004 Teodor Sigaev <teodor@sigaev.ru>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *        notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *        notice, this list of conditions and the following disclaimer in the
12  *        documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the author nor the names of any co-contributors
14  *        may be used to endorse or promote products derived from this software
15  *        without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY CONTRIBUTORS ``AS IS'' AND ANY EXPRESS
18  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL CONTRIBUTORS BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
25  * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
27  * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <ctype.h>
34
35 static void pushoutstr(FILE *out, char *buf, int len);
36
/*
 * Print the program banner and the command-line synopsis to stdout,
 * then terminate the process with exit status 0.  Never returns.
 */
static void
usage() {
        fputs("Clear text from lib.ru for Pocket PC, Version 0.2.\n"
              "Copyright (c) 2004 Teodor Sigaev <teodor@sigaev.ru>\n"
              "       All rights reserved.\n"
              "Usage:\n   clrlibru [-i INPUTFILE] [-o OUTPUT] [-l NUMSPACE]\n",
              stdout);
        exit(0);
}
45
/*
 * Lower-case names of the tags that is_rtag() recognises.
 * The list is terminated by a NULL sentinel.
 */
static char *RemoveTag[] = {
        "select", "head", "div", "a",
        "form", "script", "style", "title",
        NULL
};

/*
 * Tell whether the first `len` characters of `tag` spell one of the
 * names in RemoveTag.
 *
 * The comparison is exact and case-sensitive.  As a side effect the
 * function writes a terminating '\0' at tag[len], so the caller's buffer
 * must have room for at least len+1 characters.
 *
 * Returns 1 on a match, 0 otherwise.
 */
static int
is_rtag(char *tag, int len) {
        int idx;

        tag[len] = '\0';
        for ( idx = 0; RemoveTag[idx] != NULL; idx++ ) {
                if ( !strcmp(RemoveTag[idx], tag) )
                        return 1;
        }
        return 0;
}
70
/*
 * One entity-replacement rule.
 *   quote - entity name as it appears between '&' and ';'
 *   str   - text written to the output in its place
 *   len   - cached strlen(str); -1 means "not computed yet"
 *           (pushoutquot fills it in on first use)
 */
typedef struct {
        char    *quote;
        char    *str;
        int     len;
} Quote;

/*
 * Entity replacement table, terminated by an entry whose quote is NULL.
 * Lookup stops at the first matching name, so each name is listed once
 * (a second "quot" row could never be reached and has been dropped).
 */
static Quote quoteChange[] = {
        {"quot", "\"", -1},
        {"laquo", "\"", -1},
        {"raquo", "\"", -1},
        {"lt", "<", -1},
        {"gt", ">", -1},
        {"nbsp", " ", -1},
        {"mdash", "-", -1},
        {"amp", "&", -1},
        {"shy", "", -1},
        {NULL, NULL, -1}
};
90
91 static int
92 pushoutquot(FILE *out, char *buf, int buflen) {
93         Quote   *ptr = quoteChange;
94
95         buf[buflen]='\0';
96         while( ptr->quote ) {
97                 if ( strcmp( ptr->quote, buf ) == 0 ) {
98                         if ( ptr->len < 0 ) 
99                                 ptr->len = strlen( ptr->str );
100                         pushoutstr( out, ptr->str, ptr->len );
101                         return 1;
102                 }
103                 ptr++;
104         }
105         return 0;
106 }
107
108
/* Value of the option most recently returned by mgetopt(), or NULL if it takes none. */
static char *optarg = NULL;
/* Index in argv of the next argument mgetopt() will examine. */
static int current=1;

/*
 * Minimal getopt-like command-line scanner.
 *
 *   argn   - number of entries in argv
 *   argv   - argument vector; argv[0] is skipped
 *   option - accepted option letters; a letter followed by ':' takes a
 *            value, which is read from the next argv entry
 *
 * Every argument must start with '-' or '/'; only the character right
 * after that prefix is used as the option key.  On success the key is
 * returned and optarg points at the option's value (NULL when the option
 * takes none).  Returns -1 once all arguments have been consumed.
 *
 * An unknown option, an argument without the '-' or '/' prefix, or a
 * missing value prints a message to stdout and calls exit(1).
 */
int
mgetopt(int argn, char* argv[], char *option) {
        char *arg;
        char *spec;
        char key;

        if ( current >= argn ) return -1;

        arg = argv[current];
        if ( *arg != '-' && *arg != '/' ) {
                printf("Unknown option: %s\n", arg);
                exit(1);
        }

        /*
         * The prefix is known to be present, so arg[1] is inside the string.
         * A bare "-" yields key == '\0', which strchr() would "find" at the
         * terminator of the option spec, and ':' would match a separator
         * rather than an option letter: reject both explicitly.
         */
        key = arg[1];
        spec = ( key == '\0' || key == ':' ) ? NULL : strchr( option, key );
        if ( spec == NULL ) {
                printf( "Unknown option: %s\n", arg);
                exit(1);
        }

        current++;
        if ( spec[1] == ':' ) {
                if ( current >= argn ) {
                        printf("No value for -%c\n" ,key);
                        exit(1);
                }
                optarg = argv[current];
                current++;
        } else {
                optarg = NULL;
        }

        return (int)key;
}
142
/*
 * States of the character-level scanner in main().
 * The numeric values are the same as before; only the spelling changed.
 */
enum {
        INTXT           = 0,    /* ordinary text, copied to the output */
        INTAG           = 1,    /* inside a tag, skipping up to '>' */
        FINDEND         = 2,    /* just after a line break, counting leading blanks */
        INDROPTAG       = 3,    /* inside the content of a removed tag, waiting for '<' */
        INDROPINTAG     = 4,    /* saw '<' inside removed content, expecting '/' */
        INHEADTAG       = 5,    /* right after '<', collecting the tag name */
        INDROPINCLSTAG  = 6,    /* saw "</" inside removed content, matching the name */
        WAITAFTERRED    = 7,    /* paragraph indent written, skipping remaining blanks */
        COMMENTBEGIN1   = 8,    /* saw '!' in a tag head */
        COMMENTBEGIN2   = 9,    /* saw the first '-' after '!' */
        COMMENTIN       = 10,   /* inside a comment */
        COMMENTEND1     = 11,   /* saw one '-' inside a comment */
        COMMENTEND2     = 12,   /* saw "--" inside a comment, expecting '>' */
        INQUOTE         = 13    /* after '&', collecting an entity name */
};

/* Size of the scratch buffer used for tag and entity names. */
#define BUFFERLENGTH    8192
/* Indent written at the start of every paragraph. */
#define REDSTRING       "  "
160
/* Kinds of output event understood by pushout(). */
typedef enum TypeOut {
        Char = 0,       /* a single character to write */
        NewLine,        /* a line break */
        Tag,            /* a markup tag was skipped at this point */
        Paragraph,      /* start of a new paragraph */
        None            /* nothing emitted yet (initial "previous" event) */
} TypeOut;
168
169  
170 static void
171 pushout( FILE *out, TypeOut type, int value ) {
172         static TypeOut PrevType=None;
173         static int  prevvalue=0;
174         static int newlinecount=0;
175
176         if ( type == Char ) {
177                 if ( !(( PrevType==Tag || PrevType==None || PrevType==NewLine ) && ( value == ' ' || value == '\t' )) ) { 
178                         newlinecount=0;
179                         fputc(value ,out);
180                 }
181                 prevvalue = value;
182         } else if ( type == NewLine ) {
183                 if ( newlinecount < 2 ) 
184                         fputc('\n', out);
185                 newlinecount++;
186         } else if ( type != PrevType ) {
187                 switch(type) {
188                         case Tag:
189                                 if ( !(PrevType==None || PrevType==NewLine || ( PrevType==Char && ( prevvalue == ' ' || prevvalue == '\t' || prevvalue == '\n'  ) )) ) { 
190                                         newlinecount=0;
191                                         fputc(' ', out);
192                                 }
193                                 break;
194                         case Paragraph:
195                                 pushout(out, NewLine, 0);
196                                 fwrite(REDSTRING, sizeof(char), strlen(REDSTRING), out); 
197                                 newlinecount=0;
198                                 break;
199                         default:
200                                 printf("Unknown type: %d", type);
201                                 exit(1);
202                 }
203         }
204         PrevType = type;
205 }
206
207 static void
208 pushoutstr(FILE *out, char *buf, int len) {
209         char *ptr=buf;
210         while( ptr-buf<len ) {
211                 pushout(out, Char, (int)(*ptr));
212                 ptr++;
213         } 
214 }
215
216 int 
217 main(int argn, char *argv[]) {
218         int ch;
219         FILE    *in=stdin, *out=stdout;
220         int state=INTXT;
221         char    buf[BUFFERLENGTH];
222         int lenbuf=0,closelen=0;
223         int spacelen=4;
224
225         while((ch = mgetopt(argn, argv, "l:i:o:h?"))!=-1) {
226                 switch (ch) {
227                         case 'i':
228                                 if ( (in=fopen(optarg, "r"))==NULL) {
229                                         printf("Can't open file %s\n", optarg);
230                                         exit(1);
231                                 }
232                                 break;
233                         case 'o':
234                                 if ( (out=fopen(optarg, "w"))==NULL) {
235                                         printf("Can't open file %s\n",optarg);
236                                         exit(1);
237                                 }
238                                 break;
239                         case 'l':
240                                 spacelen = atoi(optarg);
241                                 if ( spacelen < 0 || spacelen >= BUFFERLENGTH ) {
242                                         printf("-l should be >= 0 and < %d\n", BUFFERLENGTH);
243                                         exit(1);
244                                 }
245                                 break;
246                         case 'h':
247                         case '?':
248                         default:
249                                 usage();
250                 }
251         }
252         
253         while( (ch=getc(in)) != EOF ) {
254                 if ( ch == '\r' )
255                         continue;
256
257                 if ( state==INTXT ) {
258                         if ( ch == '<' ) {
259                                 state=INHEADTAG;
260                                 lenbuf=0;
261                         } else if ( ch == '\n' ) {
262                                 state=FINDEND;
263                                 lenbuf=1;
264                         } else if ( ch=='&' ) {
265                                 *buf='&';
266                                 lenbuf=1;       
267                                 state=INQUOTE;
268                         } else { 
269                                 pushout(out, Char, ch);
270                         }
271                 } else if ( state==INHEADTAG ) {
272                         if ( isalpha(ch) ) {
273                                 if ( lenbuf < BUFFERLENGTH-1 ) { 
274                                         buf[ lenbuf ] = tolower(ch);
275                                         lenbuf++;
276                                 }
277                         } else if ( ch == '!' ) {
278                                 state = COMMENTBEGIN1;
279                         } else if ( ch == '>' ) {
280                                 if ( is_rtag(buf,lenbuf) ) {
281                                         state = INDROPTAG;
282                                         closelen=0;
283                                 } else {
284                                         state=INTXT;
285                                         if ( lenbuf==0 )
286                                                 pushoutstr(out, "<>", 2);
287                                         else
288                                                 pushout(out, Tag, 0);
289                                 }
290                         } else if ( lenbuf == 0 && ch != '/' ) {
291                                 pushout(out, Char, '<');
292                                 pushout(out, Char, ch);
293                                 state=INTXT;
294                         } else { 
295                                 if ( is_rtag(buf,lenbuf) ) {
296                                         state = INDROPTAG;
297                                         closelen=0;
298                                 } else {
299                                         state=INTAG;
300                                 }
301                         }
302                 } else if ( state==INTAG ) {
303                         if ( ch == '>' ) {
304                                 state=INTXT;
305                                 pushout(out, Tag, 0);
306                         }
307                 } else if ( state == INDROPTAG ) {
308                         if ( ch == '<' ) {
309                                 state=INDROPINTAG;
310                                 closelen=0;
311                         }
312                 } else if ( state == INDROPINTAG ) {
313                         if ( ch == '/' )
314                                 state=INDROPINCLSTAG;
315                         else
316                                 state=INDROPTAG;
317                 } else if ( state == INDROPINCLSTAG ) {
318                         if ( isalpha(ch) ) {
319                                 if ( closelen < lenbuf && tolower(ch) == buf[closelen] ) {
320                                         closelen++;
321                                         if ( closelen==lenbuf )
322                                                 state=INTAG;
323                                 } else 
324                                         state=INDROPTAG;
325                         } else
326                                 state=INDROPTAG;
327                 } else if ( state==FINDEND ) {
328                         if ( ch == ' ' || ch == '\t' ) {
329                                 lenbuf++;
330                                 if ( lenbuf > spacelen ) {
331                                         pushout( out, Paragraph, 0 );
332                                         state=WAITAFTERRED;
333                                 }
334                         } else if ( ch=='\n' ) {
335                                 pushout( out, NewLine, 0 );
336                                 pushout( out, NewLine, 0 );
337                                 lenbuf++;
338                         } else {
339                                 state=INTXT;
340                                 pushout(out, Char, ' ');
341                                 ungetc(ch,in);
342                         }
343                 } else if ( state==WAITAFTERRED ) {
344                         if ( !isspace(ch) ) {
345                                 ungetc(ch,in);
346                                 state=INTXT;
347                         }
348                 } else if ( state==COMMENTBEGIN1 ) {
349                         if ( ch == '-' ) {
350                                 state = COMMENTBEGIN2;
351                         } else {
352                                 pushoutstr(out, "<!", 2);
353                                 ungetc(ch,in);
354                                 state=INTXT;
355                         }
356                 } else if ( state==COMMENTBEGIN2 ) {
357                         if ( ch == '-' ) {
358                                 state = COMMENTIN;
359                         } else {
360                                 pushoutstr(out, "<!-", 2);
361                                 ungetc(ch,in);
362                                 state=INTXT;
363                         }
364                 } else if ( state==COMMENTIN ) {
365                         if ( ch == '-' ) 
366                                 state = COMMENTEND1;
367                 } else if ( state==COMMENTEND1 ) {
368                         state = ( ch == '-' ) ? COMMENTEND2 : COMMENTIN;
369                 } else if ( state==COMMENTEND2 ) {
370                         if ( ch == '>' )
371                                 state = INTXT;
372                         else if ( ch != '-' )
373                                 state = COMMENTIN;
374                 } else if ( state==INQUOTE ) {
375                         if ( isalpha( ch ) && lenbuf < BUFFERLENGTH-2 ) {
376                                 buf[ lenbuf ] = ch;
377                                 lenbuf++;
378                         } else if ( ch == ';' && lenbuf>1 && pushoutquot( out, buf+1, lenbuf-1 ) ) {
379                                 state = INTXT;
380                         } else {
381                                 pushoutstr(out, buf, lenbuf);
382                                 state = INTXT;
383                                 ungetc(ch,in);
384                         }
385                 } else {
386                         printf("Unknown state: %d\n", state);
387                         exit(1);
388                 }
389         }
390                         
391         if ( in!=stdin )
392                 fclose(in);
393         if ( out!=stdout )
394                 fclose(out);
395
396         return 0;       
397 }
398