add one more cycle
[tedtools.git] / sfxtest.c
1 /*
2  * Copyright (c) 2004 Teodor Sigaev <teodor@sigaev.ru>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *      notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *      notice, this list of conditions and the following disclaimer in the
12  *      documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the author nor the names of any co-contributors
14  *      may be used to endorse or promote products derived from this software
15  *      without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY CONTRIBUTORS ``AS IS'' AND ANY EXPRESS
18  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL CONTRIBUTORS BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
25  * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
27  * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29
30 #include <stdio.h>
31 #include <string.h>
32 #include <stdlib.h>
33 #include <unistd.h>
34 #include <ctype.h>
35 #include <sys/time.h>
36
37 #include "tools.h"
38 #include "tlog.h"
39 #include "tmalloc.h"
40 #include "sfxstr.h"
41
42 extern char *optarg;
43 static int verbose=1;
44
45 static int
46 STRINGCMP(const void *a, const void *b) {
47         return strcmp( *(char**)a, *(char**)b );
48 }
49
50 static void
51 usage() {
52         puts("Usage:");
53         puts("./sfxtest -d datafile [-b| [-a addondatafile] [-l | -p | -r | -g] [-n]] [-q]");
54         puts("Detailed description:");
55
56         puts("  Binary search");
57         puts("    ./sfxtest -b -d datafile [-q]");
58         puts("       -q - quiet (no output)");
59
60         puts("  Optimal suffix search");
61         puts("    ./sfxtest -d datafile [-a addondatafile] [-l | -p | -r | -g]  [-n] [-q]");
62         puts("       -a addondatafile - addon file data for loading");
63         puts("       -g - histogramm mode");
64         puts("       -l - listing mode");
65         puts("       -p - prefix search mode");
66         puts("       -r - range search mode");
67         puts("       -n - enumerate entries");
68         puts("       -q - quiet (no output)");
69         
70         exit(1);
71 }
72
73 static void
74 gistogramm(SFSNode *node, int *gist) {
75         if ( !node )
76                 return;
77
78       if ( node->isskip ) {
79                if ( node->haschild ) {
80                        gist[1]++;
81                        gistogramm( *(SFSNode**)(node->data), gist );
82                } else
83                       gist[0]++;
84        } else {
85                SFSNodeData *data;
86
87                gist[ node->nchild ]++;
88
89                data=node->data;
90                while(data - node->data < node->nchar) {
91                        if ( data->haschild )
92                                gistogramm( *(SFSNode**)( ((char*)data) + data->child ), gist);
93                        data++;
94                }
95         }
96 }
97
98 int
99 main(int argn, char *argv[]) {
100         struct timeval begin,end;
101         char *datafile=NULL;
102         int i, binary=0;
103         FILE *in;
104         char buf[4096];
105         double sumtime=0.0;
106         int structsize=0, list=0, prefix=0, range=0, enumerate=0,gmode=0;
107         char *addondatafile=NULL;
108
109         int len=4096, curlen=0, checked=0, success=0;
110         char **data;
111         void *res;
112         
113         while ((i = getopt(argn, argv, "pd:bqha:lrng")) != EOF) {
114                 switch(i) {
115                         case 'a':
116                                 addondatafile=optarg;
117                                 break;
118                         case 'd':
119                                 datafile=optarg;
120                                 break;
121                         case 'n':
122                                 enumerate=1;
123                                 break;
124                         case 'g':
125                                 gmode=1;
126                                 break;
127                         case 'r':
128                                 range=1;
129                                 break;
130                         case 'p':
131                                 prefix=1;
132                                 break;
133                         case 'l':
134                                 list=1;
135                                 break;
136                         case 'b':
137                                 binary=1;
138                                 break;
139                         case 'q':
140                                 verbose=0;
141                                 break;
142                         case 'h':
143                         default: 
144                                 usage();
145                 }
146         }
147
148         if (!datafile) usage();
149
150         opentlog(TL_OPEN_STDERR|TL_OPEN_SYSLOG|TL_OPEN_FILE, TL_INFO, "./sfxtest.log");
151         if ( (in=fopen(datafile,"r"))==NULL ) 
152                 tlog(TL_CRIT|TL_EXIT,"Beda with %s", datafile);
153
154         data=(char**)tmalloc(len*sizeof(char*));
155         while(fgets(buf,4096,in)) {
156                 structsize+=clrspace(buf)+1;
157                 if ( !*buf ) continue;
158                 if (curlen+2 > len) {
159                         len*=2;
160                         data=(char**)trealloc(data, len*sizeof(char*));
161                 }
162                 data[curlen]=tstrdup(buf);
163                 curlen++;
164         }
165         fclose(in);
166         data[curlen]=NULL;
167         structsize+=sizeof(char*)*curlen;
168
169         if ( binary == 1 ) {
170                 gettimeofday( &begin, NULL );
171                 qsort(data, curlen, sizeof(char**), STRINGCMP);
172                 tlog(TL_INFO,"Init time: %.03f secs",  elapsedtime(&begin) );
173                 tlog(TL_INFO,"Memory allocated: %.2fMb", ((float)structsize)/(1024.0*1024.0));
174
175                 gettimeofday( &begin, NULL );
176                 while(fgets(buf,4096,stdin)) {
177                         char *ptr=buf;
178                         len = clrspace(buf);
179                         if (!len) continue;
180                         res = bsearch(&ptr, data, curlen, sizeof(char**), STRINGCMP);
181                         
182                         if (verbose) 
183                                 puts( (res) ? "Y" : "N" );
184
185                         checked++;
186                         if (res) success++;
187                 }
188                 gettimeofday( &end, NULL );
189         } else {
190                 SFSTree info;
191                 SFSDataIO n = {NULL,0,NULL};
192                 n.data = (void*)&enumerate;
193
194                 gettimeofday( &begin, NULL );
195                 if (enumerate) {
196                         char **ptr=data;
197
198                         SFSInit_dp(&info,sizeof(enumerate),NULL);
199                         while(*ptr) {
200                                 n.key = *ptr;
201                                 n.keylen=0;
202                                 SFSAdd(&info, &n);
203                                 enumerate++;
204                                 ptr++;
205                         }
206                 } else
207                         SFSInit_c(&info,data);
208                 tlog(TL_INFO,"Init time: %.03f secs",  elapsedtime(&begin) );
209                 tlog(TL_INFO,"Memory allocated: %.2fMb", ((float)info.totalen)/(1024.0*1024.0));
210                 tlog(TL_INFO,"Number of nodes: %d", info.nnodes);
211
212                 for(i=0;i<curlen;i++)
213                         tfree(data[i]);
214                 tfree(data);
215
216                 if ( addondatafile ) {
217                         int cntadd=0;
218
219                         if ( (in=fopen(addondatafile,"r"))==NULL ) 
220                                 tlog(TL_CRIT|TL_EXIT,"Beda with %s", addondatafile);
221                         n.key=buf;
222
223                         gettimeofday( &begin, NULL );
224                         while(fgets(buf,4096,in)) {
225                                 if ( !clrspace(buf) ) continue;
226                                 n.keylen=0;
227                                 SFSAdd(&info, &n);
228                                 if (enumerate) enumerate++;
229                                 cntadd++;
230                         }
231                         gettimeofday( &end, NULL );
232                         fclose(in);
233                         sumtime = timediff(&begin,&end);
234                         tlog(TL_INFO,"Add time: %.03f secs,   %.03f per second", sumtime, ((double)cntadd)/sumtime);
235                         tlog(TL_INFO,"Added item: %d", cntadd);
236                         tlog(TL_INFO,"Memory allocated: %.2fMb", ((float)info.totalen)/(1024.0*1024.0));
237                         tlog(TL_INFO,"Number of nodes: %d", info.nnodes);
238                         curlen+=cntadd;
239                 }
240
241                 if (list) {
242                         SFSDataIO out;
243
244                         gettimeofday( &begin, NULL );
245                         SFSIteratorStart(&info);
246                         while( SFSIterate(&info, &out) ) { 
247                                 if ( verbose ) {
248                                         if (enumerate) {
249                                                 fputs(out.key, stdout);
250                                                 printf(" %d\n", *(int*)(out.data));
251                                         } else
252                                                 puts(out.key);
253                                 }
254                                 checked++; success++;
255                         }
256                         gettimeofday( &end, NULL );
257                 } else if ( gmode ) {
258                         int Gist[256];
259    
260                         memset(Gist,0,sizeof(int)*256);
261                         gettimeofday( &begin, NULL );
262                         gistogramm(info.node, Gist);
263                         gettimeofday( &end, NULL );
264                         for(i=0;i<256;i++)
265                                 printf("%d\t%d\n",i,Gist[i]);
266                 } else if ( prefix ) {
267                         SFSDataIO out;
268                         
269                         gettimeofday( &begin, NULL );
270                         while(fgets(buf,4096,stdin)) {
271                                 len = clrspace(buf);
272                                 if (!len) continue;
273
274                                 SFSPrefixIteratorStart(&info,buf);
275                                 while( SFSIterate(&info, &out)) { 
276                                         if (verbose) { 
277                                                 if (enumerate) {
278                                                         putchar('>'); 
279                                                         fputs(out.key, stdout);
280                                                         printf(" %d\n", *(int*)(out.data));
281                                                 } else {
282                                                         putchar('>'); 
283                                                         puts(out.key);
284                                                 }
285                                         }
286                                         success++;
287                                 }       
288                                 checked++;
289                         }
290                         gettimeofday( &end, NULL );
291                 } else if ( range ) {
292                         SFSDataIO f,l;
293                         
294                         gettimeofday( &begin, NULL );
295                         while(fgets(buf,4096,stdin)) {
296                                 len = clrspace(buf);
297                                 if (!len) continue;
298
299                                 if ( SFSRange(&info,buf,&f,&l) ) {
300                                         if (verbose) { 
301                                                 if (enumerate) {
302                                                         putchar('>'); 
303                                                         fputs(f.key, stdout);
304                                                         printf(" %d\n", *(int*)(f.data));
305                                                         putchar('>'); 
306                                                         fputs(l.key, stdout);
307                                                         printf(" %d\n", *(int*)(l.data));
308                                                 } else {
309                                                         putchar('>'); puts(f.key);
310                                                         putchar('>'); puts(l.key);
311                                                 }
312                                         }
313                                         success++;
314                                 }       
315                                 checked++;
316                         }
317                         gettimeofday( &end, NULL );
318                 } else {
319                         gettimeofday( &begin, NULL );
320                         while(fgets(buf,4096,stdin)) {
321                                 len = clrspace(buf);
322                                 if (!len) continue;
323
324                                 res = SFSFindData(&info,buf);
325                                 if (verbose) {
326                                         if (enumerate && res)
327                                                 printf("%d\n", *(int*)(res));
328                                         else 
329                                                 puts( (res) ? "Y" : "N" );
330                                 }
331
332                                 checked++;
333                                 if (res) success++;
334                         }
335                         gettimeofday( &end, NULL );
336                 }
337                 SFSFree(&info,NULL);
338         }
339
340         sumtime = timediff(&begin,&end);
341         tlog(TL_INFO,"Total execution time: %.03f secs", sumtime);
342         tlog(TL_INFO,"Total words in data: %d; Checked: %d; Success: %d", curlen, checked, success);
343         tlog(TL_INFO,"%.2f words per second", ((double)checked)/sumtime);
344         
345         closetlog();
346         return 0;
347 }