finnegan.c

   1 /* This program generates documents of random lengths with random words */
   2 /* using the frequencies of words and document lengths obtained from    */
   3 /* the Wall Street Journal(1985 - 1990) in order to create databases    */
   4 /* with similar stastitical properties to real text.  Databases can be  */
   5 /* created in mg or atlas format.                                       */
   6
   7 /* Author: D. Psaradellis, December 1994 */
   8
   9 /* Modified by J. Zobel, July 1995. */
  10
  11
  12 /*
  13  * Copyright RMIT and The University of Melbourne.  This code may not be
  14  * further distributed without the permission of J. Zobel.
  15  */
  16
  17
  18 #include <stdio.h>
  19 #include <stdlib.h>
  20 #include <unistd.h>
  21 #include <string.h>
  22
  23 #include "ftsbench.h"
  24
  25 #define INITIAL_SEED 73802      /* initialise seed */
  26 #define linelen      78         /* line length */
  27 #define BUFLEN       1024
  28
  29 struct word_info {
  30         char           *word;   /* word */
  31         int             freq;   /* word's frequency */
  32         int             cum_freq;       /* cummulative freq - sum of previous
  33                                          * freq and cum_freq */
  34         short           word_len;       /* length of word in bytes */
  35 };
  36
  37 struct doc_info {
  38         int             doc_len;/* document length */
  39         int             doc_freq;       /* document frequency */
  40         int             doc_cfreq;      /* cummulative frequency */
  41 };
  42
  43 static struct word_info *a;
  44 static struct doc_info *d;
  45 static int              no_of_words = 0;
  46 static int              word_occur = 0;
  47 static int              no_of_docs = 0;
  48 static int              doc_occur = 0;
  49 static char             buf       [BUFLEN];
  50 static int              isInited = 0;
  51
  52 /* This function calculates the no. of words/doclens in the file, */
  53 /* by counting the no. of newlines                                */
  54 static int
  55 no_newline(char *filename)
  56 {
  57         FILE           *fp;
  58         int             c         , cnt = 0;
  59
  60         if ((fp = fopen(filename, "r")) == NULL) {
  61                 fprintf(stderr,"Cannot open %s\n", filename);
  62                 exit(1);
  63         }
  64         while ((c = getc(fp)) != EOF)
  65                 if (c == '\n')
  66                         cnt += 1;
  67         fclose(fp);
  68         return cnt;
  69 }
  70
  71 /* This function dynamically allocates memory to store the words, their */
  72 /* frequency, their cummulative frequency and their length in bytes     */
  73 static int
  74 build_word_array(char *filename)
  75 {
  76         FILE           *fp;
  77         int             i         , temp_cfreq = 0;
  78
  79         if ((fp = fopen(filename, "r")) == NULL) {
  80                 fprintf(stderr,"Cannot open %s\n", filename);
  81                 exit(1);
  82         }
  83         a = (struct word_info *)malloc(no_of_words * sizeof(struct word_info));
  84         if (!a) {
  85                 fprintf(stderr,"Can't allocate %d bytes\n", no_of_words * sizeof(struct word_info));
  86                 exit(1);
  87         }
  88         for (i = 0; i < no_of_words; i++) {
  89                 fscanf(fp, "%s", buf);  /* store word in temporary buffer */
  90                 a[i].word = strdup(buf);
  91                 if (!a[i].word) {
  92                         fprintf(stderr,"strdup failed\n");
  93                         exit(1);
  94                 }
  95                 fscanf(fp, "%d", &a[i].freq);
  96                 a[i].word_len = strlen(a[i].word);
  97                 a[i].cum_freq = temp_cfreq;
  98                 temp_cfreq += a[i].freq;
  99         }
 100         fclose(fp);
 101         return temp_cfreq;
 102 }
 103
 104 /* This function dynamically allocates memory to store the document lengths, */
 105 /* their frequency and their cummulative frequency                           */
 106 static int
 107 build_doc_array(char *filename)
 108 {
 109         FILE           *fp;
 110         int             i         , temp_cfreq = 0;
 111
 112         if ((fp = fopen(filename, "r")) == NULL) {
 113                 fprintf(stderr,"Cannot open %s\n", filename);
 114                 exit(1);
 115         }
 116         d = (struct doc_info *)malloc(no_of_docs * sizeof(struct doc_info));
 117         for (i = 0; i < no_of_docs; i++) {
 118                 fscanf(fp, "%d", &d[i].doc_len);
 119                 fscanf(fp, "%d", &d[i].doc_freq);
 120                 d[i].doc_cfreq = temp_cfreq;
 121                 temp_cfreq += d[i].doc_freq;
 122         }
 123         fclose(fp);
 124         return temp_cfreq;
 125 }
 126
 127 /* to locate the index of the word required                           */
 128 static int
 129 binsearch_word(int v)
 130 {
 131         int             l = 0;
 132         int             r = no_of_words;
 133         int             x;
 134
 135         x = (l + r) >> 1;
 136         while (r > l) {
 137                 if (v < a[x].cum_freq)
 138                         r = x - 1;
 139                 else if (v >= a[x].cum_freq + a[x].freq)
 140                         l = x + 1;
 141                 else
 142                         break;
 143                 x = (l + r) >> 1;
 144         }
 145         return (x);
 146 }
 147
 148 /* This function outputs a specified no. of random words taking into */
 149 /* account the linelen                                               */
 150 static void
 151 output_words(StringBuf *b, int words_remain)
 152 {
 153         int             index     , linesize = 0;
 154
 155         while (words_remain > 0) {      /* generate random no. in range of */
 156                 index = binsearch_word(rnd() % word_occur);     /* 0 - word occurrences */
 157                 if ((a[index].word_len + linesize) > linelen) { /* if len of line
 158                                                                  * exceeds */
 159                         sb_add(b, "\n", 1);
 160                         linesize = 0;
 161                 }
 162                 sb_add(b, a[index].word, a[index].word_len);
 163                 sb_add(b, " ", 1);
 164                 linesize += (a[index].word_len + 1);
 165                 words_remain--;
 166         }
 167         if (linesize > 0)
 168                 sb_add(b, "\n", 1);
 169 }
 170
 171 static char*
 172 get_words() {
 173         int     index =  binsearch_word(rnd() % word_occur);
 174         return a[index].word;
 175 }
 176
 177
 178 /* This function uses a binary search algorithm on the doc_cfreq field */
 179 /* to locate the index of the document length required                 */
 180 static int
 181 binsearch_doc(int v)
 182 {
 183         int             l = 0;
 184         int             r = no_of_docs;
 185         int             x;
 186
 187         x = (l + r) >> 2;
 188         while (r > l) {
 189                 if (v < d[x].doc_cfreq)
 190                         r = x - 1;
 191                 else if (v >= d[x].doc_cfreq + d[x].doc_freq)
 192                         l = x + 1;
 193                 else
 194                         break;
 195                 x = (l + r) >> 1;
 196         }
 197         return (x);
 198 }
 199
 200 /* This function uses a binary search algorithm on the cum_freq field */
 201 void
 202 generate_doc(StringBuf *b) {
 203         int     index;
 204
 205         b->strlen = 0;
 206         index = binsearch_doc(rnd() % doc_occur);
 207         output_words( b, d[index].doc_len );
 208 }
 209
 210
 211 char **
 212 generate_querywords() {
 213         int     index;
 214         char **res;
 215         int i;
 216
 217         index = binsearch_doc(rnd() % doc_occur);
 218         res = (char**) malloc(sizeof(char*) * (d[index].doc_len+1));
 219
 220         for(i=0;i<d[index].doc_len;i++)
 221                 res[i] = get_words();
 222         res[i] = NULL;
 223
 224         return res;
 225 }
 226
 227
 228 void
 229 finnegan_init(char *lex_file, char *doc_file) {
 230         if ( isInited ) {
 231                 fprintf(stderr,"finnegan is already inited\n");
 232                 exit(1);
 233         }
 234
 235         printf("Initialize text generator with:\n");
 236         printf("\tfile '%s' - lexeme's distribution\n", lex_file);
 237         printf("\tfile '%s' - length's distribution\n", doc_file);
 238         srnd(INITIAL_SEED);
 239         no_of_words = no_newline(lex_file);
 240         no_of_docs = no_newline(doc_file);
 241         word_occur = build_word_array(lex_file);
 242         doc_occur = build_doc_array(doc_file);
 243
 244         isInited = 1;
 245         /* output_docs(doc_occur, word_occur); */
 246 }
 247