/*
 * This program generates documents of random lengths with random words
 * using the frequencies of words and document lengths obtained from
 * the Wall Street Journal (1985 - 1990) in order to create databases
 * with similar statistical properties to real text. Databases can be
 * created in mg or atlas format.
 *
 * Author: D. Psaradellis, December 1994
 * Modified by J. Zobel, July 1995.
 */

/*
 * Copyright RMIT and The University of Melbourne.  This code may not be
 * further distributed without the permission of J. Zobel.
 */

/*
 * Cleanup and nonsignificant changes by teodor
 */

/*
 * NOTE(review): the header names of the four system includes below are
 * missing from this copy of the file and must be restored.  The code in
 * this file uses fopen/getc/fscanf/fclose, malloc, and strdup/strlen, so
 * at least the standard I/O, stdlib and string headers are needed; the
 * fourth header cannot be determined from here.
 */
#include
#include
#include
#include
#include "ftsbench.h"

#define INITIAL_SEED 73802 /* initialise seed (not referenced in the visible part of this file) */
#define linelen 78 /* maximum length of an output line, in bytes */
#define BUFLEN 1024 /* size of the scratch buffer used while reading words */

/* One entry of the word frequency table. */
struct word_info {
    char *word; /* the word itself (heap copy made with strdup) */
    int freq; /* word's frequency */
    int cum_freq; /* cumulative frequency: sum of the freq of all
                   * preceding entries (this entry's own freq excluded) */
    short word_len; /* length of word in bytes */
};

/* One entry of the document length frequency table. */
struct doc_info {
    int doc_len;/* document length; used as the number of words to emit */
    int doc_freq; /* document frequency */
    int doc_cfreq; /* cumulative frequency: sum of the doc_freq of all
                    * preceding entries */
};

/* Word table, allocated by build_word_array() with no_of_words entries. */
static struct word_info *a;
/* Document length table, allocated by build_doc_array() with no_of_docs entries. */
static struct doc_info *d;
/* Number of entries in a[]; assigned outside the code visible here. */
static int no_of_words = 0;
/* Modulus applied to rnd() when picking a word; presumably the sum of all
 * word frequencies as returned by build_word_array() - TODO confirm. */
static int word_occur = 0;
/* Number of entries in d[]; assigned outside the code visible here. */
static int no_of_docs = 0;
/* Modulus applied to rnd() when picking a document length; presumably the
 * sum of all document frequencies - TODO confirm. */
static int doc_occur = 0;
/* Scratch buffer each word is read into before being copied. */
static char buf [BUFLEN];
/* Not referenced in the visible code; presumably an "already initialised"
 * flag - verify against the initialisation routine. */
static int isInited = 0;

/* This function calculates the no. of words/doclens in the file, */
/* by counting the no. 
of newlines */ static int no_newline(char *filename) { FILE *fp; int c , cnt = 0; if ((fp = fopen(filename, "r")) == NULL) fatal("Cannot open %s\n", filename); while ((c = getc(fp)) != EOF) if (c == '\n') cnt += 1; fclose(fp); return cnt; } /* This function dynamically allocates memory to store the words, their */ /* frequency, their cummulative frequency and their length in bytes */ static int build_word_array(char *filename) { FILE *fp; int i , temp_cfreq = 0; if ((fp = fopen(filename, "r")) == NULL) fatal("Cannot open %s\n", filename); a = (struct word_info *)malloc(no_of_words * sizeof(struct word_info)); if (!a) fatal("Can't allocate %d bytes\n", no_of_words * sizeof(struct word_info)); for (i = 0; i < no_of_words; i++) { fscanf(fp, "%s", buf); /* store word in temporary buffer */ a[i].word = strdup(buf); if (!a[i].word) fatal("strdup failed\n"); fscanf(fp, "%d", &a[i].freq); a[i].word_len = strlen(a[i].word); a[i].cum_freq = temp_cfreq; temp_cfreq += a[i].freq; } fclose(fp); return temp_cfreq; } /* This function dynamically allocates memory to store the document lengths, */ /* their frequency and their cummulative frequency */ static int build_doc_array(char *filename) { FILE *fp; int i , temp_cfreq = 0; if ((fp = fopen(filename, "r")) == NULL) fatal("Cannot open %s\n", filename); d = (struct doc_info *)malloc(no_of_docs * sizeof(struct doc_info)); if ( !d ) fatal("Can't allocate %d bytes\n", no_of_docs * sizeof(struct doc_info)); for (i = 0; i < no_of_docs; i++) { fscanf(fp, "%d", &d[i].doc_len); fscanf(fp, "%d", &d[i].doc_freq); d[i].doc_cfreq = temp_cfreq; temp_cfreq += d[i].doc_freq; } fclose(fp); return temp_cfreq; } /* to locate the index of the word required */ static int binsearch_word(int v) { int l = 0; int r = no_of_words; int x; x = (l + r) >> 1; while (r > l) { if (v < a[x].cum_freq) r = x - 1; else if (v >= a[x].cum_freq + a[x].freq) l = x + 1; else break; x = (l + r) >> 1; } return (x); } /* This function outputs a specified no. 
of random words taking into */ /* account the linelen */ static void output_words(StringBuf *b, int words_remain) { int index , linesize = 0; while (words_remain > 0) { /* generate random no. in range of */ index = binsearch_word(rnd() % word_occur); /* 0 - word occurrences */ if ((a[index].word_len + linesize) > linelen) { /* if len of line * exceeds */ sb_add(b, "\n", 1); linesize = 0; } sb_add(b, a[index].word, a[index].word_len); sb_add(b, " ", 1); linesize += (a[index].word_len + 1); words_remain--; } if (linesize > 0) sb_add(b, "\n", 1); } static char* get_words() { int index = binsearch_word(rnd() % word_occur); return a[index].word; } /* This function uses a binary search algorithm on the doc_cfreq field */ /* to locate the index of the document length required */ static int binsearch_doc(int v) { int l = 0; int r = no_of_docs; int x; x = (l + r) >> 2; while (r > l) { if (v < d[x].doc_cfreq) r = x - 1; else if (v >= d[x].doc_cfreq + d[x].doc_freq) l = x + 1; else break; x = (l + r) >> 1; } return (x); } /* This function uses a binary search algorithm on the cum_freq field */ void generate_doc(StringBuf *b) { int index; b->strlen = 0; index = binsearch_doc(rnd() % doc_occur); output_words( b, d[index].doc_len ); } char ** generate_querywords() { int index; char **res; int i; index = binsearch_doc(rnd() % doc_occur); res = (char**) malloc(sizeof(char*) * (d[index].doc_len+1)); for(i=0;i