add .gitignore
[tedtools.git] / sfxtest.c
1 /*
2  * Copyright (c) 2004 Teodor Sigaev <teodor@sigaev.ru>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *      notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *      notice, this list of conditions and the following disclaimer in the
12  *      documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the author nor the names of any co-contributors
14  *      may be used to endorse or promote products derived from this software
15  *      without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY CONTRIBUTORS ``AS IS'' AND ANY EXPRESS
18  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL CONTRIBUTORS BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
25  * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
27  * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29
30 #include <stdio.h>
31 #include <string.h>
32 #include <stdlib.h>
33 #include <unistd.h>
34 #include <ctype.h>
35 #include <sys/time.h>
36
37 #include "tools.h"
38 #include "tlog.h"
39 #include "tmalloc.h"
40 #include "sfxstr.h"
41
42 extern char *optarg;
43 static int verbose=1;
44
45 static int
46 STRINGCMP(const void *a, const void *b) {
47         return strcmp( *(char**)a, *(char**)b );
48 }
49
50 static void
51 usage() {
52         puts("Usage:");
53         puts("./sfxtest -d datafile [-b| [-a addondatafile] [-P [-R] [-D] [-l | -p | -r | -g] [-n]] [-q]");
54         puts("Detailed description:");
55
56         puts("  Binary search");
57         puts("    ./sfxtest -b -d datafile [-q]");
58         puts("       -q - quiet (no output)");
59
60         puts("  Optimal suffix search");
61         puts("    ./sfxtest -d datafile [-a addondatafile] [-P [-R]] [-D] [-l | -p | -r | -g]  [-n] [-q]");
62         puts("       -a addondatafile - addon file data for loading");
63         puts("       -g - histogramm mode");
64         puts("       -l - listing mode");
65         puts("       -p - prefix search mode");
66         puts("       -r - range search mode");
67         puts("       -n - enumerate entries");
68         puts("       -P - use plain memory");
69         puts("       -R - use plain memory");
70         puts("       -D - dump/load tree");
71         puts("       -q - quiet (no output)");
72         
73         exit(1);
74 }
75
76 static void
77 gistogramm(SFSNode *node, int *gist) {
78         if ( !node )
79                 return;
80
81       if ( node->isskip ) {
82                if ( node->haschild ) {
83                        gist[1]++;
84                        gistogramm( *(SFSNode**)(node->data), gist );
85                } else
86                       gist[0]++;
87        } else {
88                SFSNodeData *data;
89
90                gist[ node->nchild ]++;
91
92                data=node->data;
93                while(data - node->data < node->nchar) {
94                        if ( data->haschild )
95                                gistogramm( *(SFSNode**)( ((char*)data) + data->child ), gist);
96                        data++;
97                }
98         }
99 }
100
101 int
102 main(int argn, char *argv[]) {
103         struct timeval begin,end;
104         char *datafile=NULL;
105         int i, binary=0;
106         FILE *in;
107         char buf[4096];
108         double sumtime=0.0;
109         int structsize=0, list=0, prefix=0, range=0, enumerate=0,gmode=0, plain=0, revert=0, dump=0;
110         char *addondatafile=NULL;
111
112         int len=4096, curlen=0, checked=0, success=0;
113         char **data;
114         void *res;
115         
116         while ((i = getopt(argn, argv, "pd:bqha:lrngPRD")) != EOF) {
117                 switch(i) {
118                         case 'a':
119                                 addondatafile=optarg;
120                                 break;
121                         case 'd':
122                                 datafile=optarg;
123                                 break;
124                         case 'n':
125                                 enumerate=1;
126                                 break;
127                         case 'g':
128                                 gmode=1;
129                                 break;
130                         case 'r':
131                                 range=1;
132                                 break;
133                         case 'p':
134                                 prefix=1;
135                                 break;
136                         case 'l':
137                                 list=1;
138                                 break;
139                         case 'b':
140                                 binary=1;
141                                 break;
142                         case 'q':
143                                 verbose=0;
144                                 break;
145                         case 'P':
146                                 plain=1;
147                                 break;
148                         case 'R':
149                                 revert=1;
150                                 break;
151                         case 'D':
152                                 dump=1;
153                                 break;
154                         case 'h':
155                         default: 
156                                 usage();
157                 }
158         }
159
160         if (!datafile) usage();
161                 
162         opentlog(TL_OPEN_STDERR|TL_OPEN_SYSLOG|TL_OPEN_FILE, TL_INFO, "./sfxtest.log");
163         if ( (in=fopen(datafile,"r"))==NULL ) 
164                 tlog(TL_CRIT|TL_EXIT,"Beda with %s", datafile);
165
166         data=(char**)tmalloc(len*sizeof(char*));
167         while(fgets(buf,4096,in)) {
168                 structsize+=clrspace(buf)+1;
169                 if ( !*buf ) continue;
170                 if (curlen+2 > len) {
171                         len*=2;
172                         data=(char**)trealloc(data, len*sizeof(char*));
173                 }
174                 data[curlen]=tstrdup(buf);
175                 curlen++;
176         }
177         fclose(in);
178         data[curlen]=NULL;
179         structsize+=sizeof(char*)*curlen;
180
181         if ( binary == 1 ) {
182                 gettimeofday( &begin, NULL );
183                 qsort(data, curlen, sizeof(char**), STRINGCMP);
184                 tlog(TL_INFO,"Init time: %.03f secs",  elapsedtime(&begin) );
185                 tlog(TL_INFO,"Memory allocated: %.2fMb", ((float)structsize)/(1024.0*1024.0));
186
187                 gettimeofday( &begin, NULL );
188                 while(fgets(buf,4096,stdin)) {
189                         char *ptr=buf;
190                         len = clrspace(buf);
191                         if (!len) continue;
192                         res = bsearch(&ptr, data, curlen, sizeof(char**), STRINGCMP);
193                         
194                         if (verbose) 
195                                 puts( (res) ? "Y" : "N" );
196
197                         checked++;
198                         if (res) success++;
199                 }
200                 gettimeofday( &end, NULL );
201         } else {
202                 SFSTree info;
203                 SFSDataIO n = {NULL,0,NULL};
204                 n.data = (void*)&enumerate;
205
206                 gettimeofday( &begin, NULL );
207                 if (enumerate) {
208                         char **ptr=data;
209
210                         SFSInit_dp(&info,sizeof(enumerate),NULL);
211                         while(*ptr) {
212                                 n.key = *ptr;
213                                 n.keylen=0;
214                                 SFSAdd(&info, &n);
215                                 enumerate++;
216                                 ptr++;
217                         }
218                 } else
219                         SFSInit_c(&info,data);
220                 tlog(TL_INFO,"Init time: %.03f secs",  elapsedtime(&begin) );
221                 tlog(TL_INFO,"Memory allocated: %.2fMb", ((float)info.totalen)/(1024.0*1024.0));
222                 tlog(TL_INFO,"Number of nodes: %d", info.nnodes);
223
224                 for(i=0;i<curlen;i++)
225                         tfree(data[i]);
226                 tfree(data);
227
228                 if ( addondatafile ) {
229                         int cntadd=0;
230
231                         if ( (in=fopen(addondatafile,"r"))==NULL ) 
232                                 tlog(TL_CRIT|TL_EXIT,"Beda with %s", addondatafile);
233                         n.key=buf;
234
235                         gettimeofday( &begin, NULL );
236                         while(fgets(buf,4096,in)) {
237                                 if ( !clrspace(buf) ) continue;
238                                 n.keylen=0;
239                                 SFSAdd(&info, &n);
240                                 if (enumerate) enumerate++;
241                                 cntadd++;
242                         }
243                         gettimeofday( &end, NULL );
244                         fclose(in);
245                         sumtime = timediff(&begin,&end);
246                         tlog(TL_INFO,"Add time: %.03f secs,   %.03f per second", sumtime, ((double)cntadd)/sumtime);
247                         tlog(TL_INFO,"Added item: %d", cntadd);
248                         tlog(TL_INFO,"Memory allocated: %.2fMb", ((float)info.totalen)/(1024.0*1024.0));
249                         tlog(TL_INFO,"Number of nodes: %d", info.nnodes);
250                         curlen+=cntadd;
251                 } 
252
253                 if ( plain && !gmode ) { 
254                         SFSMakePlain(&info, NULL);
255                         if ( revert )
256                                 tfree( SFSRevertPlain(&info) );
257                 }
258
259                 if ( dump ) {
260                         int             n = 0xDEAFBEAF;
261                         u_int32_t       nz;
262                         void            *pn;
263
264                         SFSWriteDump(&info, "./sfxtest.dump", &n, sizeof(n) );
265                         SFSFree(&info,NULL);
266                         SFSReadDump(&info, "./sfxtest.dump", &pn, &nz);
267
268                         tassert(*(int*)pn == n);
269                         tassert(sizeof(n) == nz);
270                 }
271
272                 if (list) {
273                         SFSDataIO out;
274
275                         gettimeofday( &begin, NULL );
276                         SFSIteratorStart(&info);
277                         while( SFSIterate(&info, &out) ) { 
278                                 if ( verbose ) {
279                                         if (enumerate) {
280                                                 fputs(out.key, stdout);
281                                                 printf(" %d\n", *(int*)(out.data));
282                                         } else
283                                                 puts(out.key);
284                                 }
285                                 checked++; success++;
286                         }
287                         gettimeofday( &end, NULL );
288                 } else if ( gmode ) {
289                         int Gist[256];
290    
291                         memset(Gist,0,sizeof(int)*256);
292                         gettimeofday( &begin, NULL );
293                         gistogramm(info.node, Gist);
294                         gettimeofday( &end, NULL );
295                         for(i=0;i<256;i++)
296                                 printf("%d\t%d\n",i,Gist[i]);
297                 } else if ( prefix ) {
298                         SFSDataIO out;
299                         
300                         gettimeofday( &begin, NULL );
301                         while(fgets(buf,4096,stdin)) {
302                                 len = clrspace(buf);
303                                 if (!len) continue;
304
305                                 SFSPrefixIteratorStart(&info,buf);
306                                 while( SFSIterate(&info, &out)) { 
307                                         if (verbose) { 
308                                                 if (enumerate) {
309                                                         putchar('>'); 
310                                                         fputs(out.key, stdout);
311                                                         printf(" %d\n", *(int*)(out.data));
312                                                 } else {
313                                                         putchar('>'); 
314                                                         puts(out.key);
315                                                 }
316                                         }
317                                         success++;
318                                 }       
319                                 checked++;
320                         }
321                         gettimeofday( &end, NULL );
322                 } else if ( range ) {
323                         SFSDataIO f,l;
324                         
325                         gettimeofday( &begin, NULL );
326                         while(fgets(buf,4096,stdin)) {
327                                 len = clrspace(buf);
328                                 if (!len) continue;
329
330                                 if ( SFSRange(&info,buf,&f,&l) ) {
331                                         if (verbose) { 
332                                                 if (enumerate) {
333                                                         putchar('>'); 
334                                                         fputs(f.key, stdout);
335                                                         printf(" %d\n", *(int*)(f.data));
336                                                         putchar('>'); 
337                                                         fputs(l.key, stdout);
338                                                         printf(" %d\n", *(int*)(l.data));
339                                                 } else {
340                                                         putchar('>'); puts(f.key);
341                                                         putchar('>'); puts(l.key);
342                                                 }
343                                         }
344                                         success++;
345                                 }       
346                                 checked++;
347                         }
348                         gettimeofday( &end, NULL );
349                 } else {
350                         gettimeofday( &begin, NULL );
351                         while(fgets(buf,4096,stdin)) {
352                                 len = clrspace(buf);
353                                 if (!len) continue;
354
355                                 res = SFSFindData(&info,buf,0);
356                                 if (verbose) {
357                                         if (enumerate && res)
358                                                 printf("%d\n", *(int*)(res));
359                                         else 
360                                                 puts( (res) ? "Y" : "N" );
361                                 }
362
363                                 checked++;
364                                 if (res) success++;
365                         }
366                         gettimeofday( &end, NULL );
367                 }
368                 SFSFree(&info,NULL);
369         }
370
371         sumtime = timediff(&begin,&end);
372         tlog(TL_INFO,"Total execution time: %.03f secs", sumtime);
373         tlog(TL_INFO,"Total words in data: %d; Checked: %d; Success: %d", curlen, checked, success);
374         tlog(TL_INFO,"%.2f words per second", ((double)checked)/sumtime);
375         
376         closetlog();
377         return 0;
378 }