2 * Copyright (c) 2006 Teodor Sigaev <teodor@sigaev.ru>
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the author nor the names of any co-contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY CONTRIBUTORS ``AS IS'' AND ANY EXPRESS
18 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL CONTRIBUTORS BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
25 * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
27 * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 * Utility for filtering stop words out of a lex file
/* Size passed to fgets() for the line buffers used in this file. */
40 #define TXTBUFLEN 4096
/*
 * comparestr: qsort()/bsearch() comparator for arrays of C strings.
 *
 * a, b: pointers to array elements, i.e. each one points at a `char *`.
 *
 * Returns the strcasecmp() result for the two referenced strings: negative,
 * zero or positive, with letter case ignored.
 */
static int
comparestr(const void *a, const void *b)
{
	/* Keep the const qualifier through the conversion; strings are only read. */
	const char *const *lhs = a;
	const char *const *rhs = b;

	return strcasecmp(*lhs, *rhs);
}
/*
 * readstoplist: load stop words from the text file `filename` into `s`.
 *
 * The file is read line by line with fgets(). Lines are copied with strdup()
 * into a pointer array that is grown with realloc(): 16 slots at first, then
 * doubled whenever s->len reaches the current capacity. Afterwards a list
 * holding more than one entry is sorted with comparestr.
 *
 * Failures of fopen(), realloc() and strdup() are reported on stderr.
 *
 * NOTE(review): this is a partial listing. The local declarations, the
 * statements that follow each error message, and the places where s->len and
 * s->stop are updated are not visible here - confirm against the full file.
 */
55 readstoplist(char *filename, StopList * s)
63 if ((hin = fopen(filename, "r")) == NULL) {
64 fprintf(stderr,"Can't open %s: %s\n", filename, strerror(errno));
68 while (fgets(buf, TXTBUFLEN, hin))
/*
 * Drops the last character of the line, presumably the newline that fgets()
 * keeps. NOTE(review): a final line without a trailing newline would lose its
 * last real character - confirm, and consider buf[strcspn(buf, "\n")] = '\0'.
 */
70 buf[strlen(buf) - 1] = '\0';
/* Array is full (or not allocated yet): enlarge it before storing. */
74 if (s->len >= reallen)
/* First allocation gets 16 slots; later ones double the capacity. */
78 reallen = (reallen) ? reallen * 2 : 16;
/* Result goes to tmp rather than stop - presumably so the old array is not lost on failure. */
79 tmp = (char **) realloc((void *) stop, sizeof(char *) * reallen);
82 fprintf(stderr,"Not enough memory\n");
/* Keep a private copy: buf is overwritten by the next fgets() call. */
88 stop[s->len] = strdup(buf);
91 fprintf(stderr,"Not enough memory\n");
/* Sorting only matters when there are at least two entries. */
100 if (s->stop && s->len > 1)
101 qsort(s->stop, s->len, sizeof(char *), comparestr);
/*
 * searchstoplist: look `key` up in the stop list `s`.
 *
 * The final return yields 1 when s->stop is non-NULL, s->len is positive and
 * bsearch() finds `key` using comparestr (a strcasecmp()-based comparator);
 * otherwise it yields 0. bsearch() needs the array ordered by that same
 * comparator; readstoplist sorts the list with comparestr.
 *
 * NOTE(review): keys of length <= 4 are special-cased, but the statement
 * controlled by that check is not visible in this listing - confirm what is
 * returned for short keys.
 */
105 searchstoplist(StopList * s, char *key)
107 if ( strlen(key) <=4 )
/* &key: comparestr dereferences both arguments as pointers to char *. */
109 return (s->stop && s->len > 0 && bsearch(&key, s->stop, s->len, sizeof(char *), comparestr)) ? 1 : 0;
/*
 * main: read "word count" lines from stdin and print them to stdout,
 * consulting a stop-word list loaded from the file named by argv[1]
 * (usage: prog stopfile < lex).
 *
 * NOTE(review): this is a partial listing. The declarations of buf, wrd and
 * occur and the statements controlled by the `if`s below are not visible.
 * Presumably unparsable lines, stop words and non-positive counts are
 * skipped - confirm against the full file.
 */
113 main(int argn, char *argv[]) {
/* Start with an empty list; readstoplist fills it in. */
115 StopList sl={0,NULL};
118 fprintf(stderr,"Usage: %s stopfile < lex\n", argv[0]);
122 readstoplist(argv[1], &sl);
124 while( fgets(buf, TXTBUFLEN, stdin) ) {
/*
 * Expect a word followed by an integer count.
 * NOTE(review): %s has no field width; the size of wrd is not visible here -
 * confirm it is at least TXTBUFLEN so a full input line cannot overflow it.
 */
128 if ( sscanf( buf, "%s %d", wrd, &occur )!= 2)
/* Condition holds for stop-list hits and for counts that are zero or negative. */
131 if ( searchstoplist(&sl, wrd) || occur <=0 )
134 printf("%s %d\n", wrd, occur);