finnegan.c

   1 /* This program generates documents of random lengths with random words */
   2 /* using the frequencies of words and document lengths obtained from    */
   3 /* the Wall Street Journal(1985 - 1990) in order to create databases    */
   4 /* with similar stastitical properties to real text.  Databases can be  */
   5 /* created in mg or atlas format.                                       */
   6
   7 /* Author: D. Psaradellis, December 1994 */
   8
   9 /* Modified by J. Zobel, July 1995. */
  10
  11
  12 /*
  13  * Copyright RMIT and The University of Melbourne.  This code may not be
  14  * further distributed without the permission of J. Zobel.
  15  */
  16
  17 /*
  18  * Cleanup and nonsignificant changes by teodor <teodor@sigaev.ru>
  19  */
  20
  21 #include <stdio.h>
  22 #include <stdlib.h>
  23 #include <unistd.h>
  24 #include <string.h>
  25
  26 #include "ftsbench.h"
  27
  28 #define INITIAL_SEED 73802      /* initialise seed */
  29 #define linelen      78         /* line length */
  30 #define BUFLEN       1024
  31
  32 struct word_info {
  33         char           *word;   /* word */
  34         int             freq;   /* word's frequency */
  35         int             cum_freq;       /* cummulative freq - sum of previous
  36                                          * freq and cum_freq */
  37         short           word_len;       /* length of word in bytes */
  38 };
  39
  40 struct doc_info {
  41         int             doc_len;/* document length */
  42         int             doc_freq;       /* document frequency */
  43         int             doc_cfreq;      /* cummulative frequency */
  44 };
  45
  46 static struct word_info *a;
  47 static struct doc_info *d;
  48 static int              no_of_words = 0;
  49 static int              word_occur = 0;
  50 static int              no_of_docs = 0;
  51 static int              doc_occur = 0;
  52 static char             buf       [BUFLEN];
  53 static int              isInited = 0;
  54
  55 /* This function calculates the no. of words/doclens in the file, */
  56 /* by counting the no. of newlines                                */
  57 static int
  58 no_newline(char *filename)
  59 {
  60         FILE           *fp;
  61         int             c         , cnt = 0;
  62
  63         if ((fp = fopen(filename, "r")) == NULL) {
  64                 fprintf(stderr,"Cannot open %s\n", filename);
  65                 exit(1);
  66         }
  67         while ((c = getc(fp)) != EOF)
  68                 if (c == '\n')
  69                         cnt += 1;
  70         fclose(fp);
  71         return cnt;
  72 }
  73
  74 /* This function dynamically allocates memory to store the words, their */
  75 /* frequency, their cummulative frequency and their length in bytes     */
  76 static int
  77 build_word_array(char *filename)
  78 {
  79         FILE           *fp;
  80         int             i         , temp_cfreq = 0;
  81
  82         if ((fp = fopen(filename, "r")) == NULL) {
  83                 fprintf(stderr,"Cannot open %s\n", filename);
  84                 exit(1);
  85         }
  86         a = (struct word_info *)malloc(no_of_words * sizeof(struct word_info));
  87         if (!a) {
  88                 fprintf(stderr,"Can't allocate %d bytes\n", no_of_words * sizeof(struct word_info));
  89                 exit(1);
  90         }
  91         for (i = 0; i < no_of_words; i++) {
  92                 fscanf(fp, "%s", buf);  /* store word in temporary buffer */
  93                 a[i].word = strdup(buf);
  94                 if (!a[i].word) {
  95                         fprintf(stderr,"strdup failed\n");
  96                         exit(1);
  97                 }
  98                 fscanf(fp, "%d", &a[i].freq);
  99                 a[i].word_len = strlen(a[i].word);
 100                 a[i].cum_freq = temp_cfreq;
 101                 temp_cfreq += a[i].freq;
 102         }
 103         fclose(fp);
 104         return temp_cfreq;
 105 }
 106
 107 /* This function dynamically allocates memory to store the document lengths, */
 108 /* their frequency and their cummulative frequency                           */
 109 static int
 110 build_doc_array(char *filename)
 111 {
 112         FILE           *fp;
 113         int             i         , temp_cfreq = 0;
 114
 115         if ((fp = fopen(filename, "r")) == NULL) {
 116                 fprintf(stderr,"Cannot open %s\n", filename);
 117                 exit(1);
 118         }
 119         d = (struct doc_info *)malloc(no_of_docs * sizeof(struct doc_info));
 120         for (i = 0; i < no_of_docs; i++) {
 121                 fscanf(fp, "%d", &d[i].doc_len);
 122                 fscanf(fp, "%d", &d[i].doc_freq);
 123                 d[i].doc_cfreq = temp_cfreq;
 124                 temp_cfreq += d[i].doc_freq;
 125         }
 126         fclose(fp);
 127         return temp_cfreq;
 128 }
 129
 130 /* to locate the index of the word required                           */
 131 static int
 132 binsearch_word(int v)
 133 {
 134         int             l = 0;
 135         int             r = no_of_words;
 136         int             x;
 137
 138         x = (l + r) >> 1;
 139         while (r > l) {
 140                 if (v < a[x].cum_freq)
 141                         r = x - 1;
 142                 else if (v >= a[x].cum_freq + a[x].freq)
 143                         l = x + 1;
 144                 else
 145                         break;
 146                 x = (l + r) >> 1;
 147         }
 148         return (x);
 149 }
 150
 151 /* This function outputs a specified no. of random words taking into */
 152 /* account the linelen                                               */
 153 static void
 154 output_words(StringBuf *b, int words_remain)
 155 {
 156         int             index     , linesize = 0;
 157
 158         while (words_remain > 0) {      /* generate random no. in range of */
 159                 index = binsearch_word(rnd() % word_occur);     /* 0 - word occurrences */
 160                 if ((a[index].word_len + linesize) > linelen) { /* if len of line
 161                                                                  * exceeds */
 162                         sb_add(b, "\n", 1);
 163                         linesize = 0;
 164                 }
 165                 sb_add(b, a[index].word, a[index].word_len);
 166                 sb_add(b, " ", 1);
 167                 linesize += (a[index].word_len + 1);
 168                 words_remain--;
 169         }
 170         if (linesize > 0)
 171                 sb_add(b, "\n", 1);
 172 }
 173
 174 static char*
 175 get_words() {
 176         int     index =  binsearch_word(rnd() % word_occur);
 177         return a[index].word;
 178 }
 179
 180
 181 /* This function uses a binary search algorithm on the doc_cfreq field */
 182 /* to locate the index of the document length required                 */
 183 static int
 184 binsearch_doc(int v)
 185 {
 186         int             l = 0;
 187         int             r = no_of_docs;
 188         int             x;
 189
 190         x = (l + r) >> 2;
 191         while (r > l) {
 192                 if (v < d[x].doc_cfreq)
 193                         r = x - 1;
 194                 else if (v >= d[x].doc_cfreq + d[x].doc_freq)
 195                         l = x + 1;
 196                 else
 197                         break;
 198                 x = (l + r) >> 1;
 199         }
 200         return (x);
 201 }
 202
 203 /* This function uses a binary search algorithm on the cum_freq field */
 204 void
 205 generate_doc(StringBuf *b) {
 206         int     index;
 207
 208         b->strlen = 0;
 209         index = binsearch_doc(rnd() % doc_occur);
 210         output_words( b, d[index].doc_len );
 211 }
 212
 213
 214 char **
 215 generate_querywords() {
 216         int     index;
 217         char **res;
 218         int i;
 219
 220         index = binsearch_doc(rnd() % doc_occur);
 221         res = (char**) malloc(sizeof(char*) * (d[index].doc_len+1));
 222
 223         for(i=0;i<d[index].doc_len;i++)
 224                 res[i] = get_words();
 225         res[i] = NULL;
 226
 227         return res;
 228 }
 229
 230
 231 void
 232 finnegan_init(char *lex_file, char *doc_file, int quiet) {
 233         if ( isInited ) {
 234                 fprintf(stderr,"finnegan is already inited\n");
 235                 exit(1);
 236         }
 237
 238         if (!quiet) {
 239                 printf("Initialize text generator with:\n");
 240                 printf("\tfile '%s' - lexeme's distribution\n", lex_file);
 241                 printf("\tfile '%s' - length's distribution\n", doc_file);
 242         }
 243         srnd(INITIAL_SEED);
 244         no_of_words = no_newline(lex_file);
 245         no_of_docs = no_newline(doc_file);
 246         word_occur = build_word_array(lex_file);
 247         doc_occur = build_doc_array(doc_file);
 248
 249         isInited = 1;
 250         /* output_docs(doc_occur, word_occur); */
 251 }
 252