Initial revision
[ftsbench.git] / stopfilter.c
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <strings.h>
5
/*
 * Utility for filtering stop words out of a lex file
 */
9
10
11 #define TXTBUFLEN       4096
12
/*
 * In-memory stop-word list.
 *
 * Field order matters: the list is initialised positionally as {0, NULL}.
 */
typedef struct
{
	int		len;		/* number of entries stored in stop[] */
	char  **stop;		/* array of heap-allocated words, NULL when empty */
} StopList;
18
/*
 * qsort()/bsearch() comparator for arrays of C strings.
 *
 * a and b each point at an array element, i.e. at a pointer to a string.
 * The two strings are compared case-insensitively with strcasecmp()
 * (declared in <strings.h>), and its result is returned unchanged:
 * negative, zero or positive as the first string sorts before, equal to or
 * after the second.
 */
static int
comparestr(const void *a, const void *b)
{
	/* keep the const qualifier the callback signature promises */
	const char *const *lhs = a;
	const char *const *rhs = b;

	return strcasecmp(*lhs, *rhs);
}
24
25 static void
26 readstoplist(char *filename, StopList * s)
27 {
28         char      **stop = NULL;
29         FILE       *hin;
30         char            buf[TXTBUFLEN];
31         int                     reallen = 0;
32
33         s->len = 0;
34         if ((hin = fopen(filename, "r")) == NULL) {
35                 fprintf(stderr,"Can't open %s: %s\n", filename, strerror(errno));
36                 exit(1);
37         }
38
39         while (fgets(buf, TXTBUFLEN, hin))
40         {
41                 buf[strlen(buf) - 1] = '\0';
42                 if (*buf == '\0')
43                         continue;
44
45                 if (s->len >= reallen)
46                 {
47                         char      **tmp;
48
49                         reallen = (reallen) ? reallen * 2 : 16;
50                         tmp = (char **) realloc((void *) stop, sizeof(char *) * reallen);
51                         if (!tmp)
52                         {
53                                 fprintf(stderr,"Not enough memory");
54                                 exit(1);
55                         }
56                         stop = tmp;
57                 }
58
59                 stop[s->len] = strdup(buf);
60                 if (!stop[s->len])
61                 {
62                         fprintf(stderr,"Not enough memory");
63                         exit(1);
64                 }
65
66                 (s->len)++;
67         }
68         fclose(hin);
69         s->stop = stop;
70
71         if (s->stop && s->len > 1)
72                 qsort(s->stop, s->len, sizeof(char *), comparestr);
73 }
74
75 static int
76 searchstoplist(StopList * s, char *key)
77 {
78         if ( strlen(key) <=4 )
79                 return 1;
80         return (s->stop && s->len > 0 && bsearch(&key, s->stop, s->len, sizeof(char *), comparestr)) ? 1 : 0;
81 }
82
83 int 
84 main(int argn, char *argv[]) {
85         char    buf[TXTBUFLEN];
86         StopList        sl={0,NULL};
87
88         if ( argn != 2 ) {
89                 fprintf(stderr,"Usage: %s stopfile < lex\n", argv[0]);
90                 exit(1);
91         }
92
93         readstoplist(argv[1], &sl);
94
95         while( fgets(buf, TXTBUFLEN, stdin) ) {
96                 char    wrd[TXTBUFLEN];
97                 int             occur;
98
99                 if ( sscanf( buf, "%s %d", wrd, &occur )!= 2)  
100                         continue;
101
102                 if ( searchstoplist(&sl, wrd) || occur <=0 )
103                         continue;
104
105                 printf("%s %d\n", wrd, occur);
106         }
107
108         return 0;
109 }