Initial revision

author teodor <teodor>

Fri, 4 Apr 2008 16:02:14 +0000 (16:02 +0000)

committer teodor <teodor>

Fri, 4 Apr 2008 16:02:14 +0000 (16:02 +0000)
author teodor <teodor>
Fri, 4 Apr 2008 16:02:14 +0000 (16:02 +0000)
committer teodor <teodor>
Fri, 4 Apr 2008 16:02:14 +0000 (16:02 +0000)
diff --git a/Makefile b/Makefile

new file mode 100644 (file)

index 0000000..8d36bf4
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,14 @@
+PG_CPPFLAGS = -DOPTIMIZE_WILDCARD_QUERY
+MODULE_big = wildspeed
+OBJS = wildspeed.o 
+
+DATA_built = wildspeed.sql
+DATA = uninstall_wildspeed.sql
+REGRESS = wildspeed
+
+
+subdir = contrib/wildspeed
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+
+include $(top_srcdir)/contrib/contrib-global.mk
diff --git a/README.wildspeed b/README.wildspeed

new file mode 100644 (file)

index 0000000..1bee021
--- /dev/null
+++ b/README.wildspeed
@@ -0,0 +1,158 @@
+WildSpeed - fast wildcard search for LIKE operator
+
+   Wildspeed extension provides GIN index support for wildcard search
+   for LIKE operator.
+
+   Online version of this document is available
+   http://www.sai.msu.su/~megera/wiki/wildspeed
+
+Authors
+
+     * Oleg Bartunov <oleg@sai.msu.su>, Moscow, Moscow University, Russia
+     * Teodor Sigaev <teodor@sigaev.ru>, Moscow, Moscow University,Russia
+
+License
+
+   Stable version, included into PostgreSQL distribution, released under
+   BSD license. Development version, available from this site, released
+   under the GNU General Public License, version 2 (June 1991)
+
+Downloads
+
+   Stable version of wildspeed is available from
+   http://www.sigaev.ru/cvsweb/cvsweb.cgi/wildspeed/
+
+Installation
+
+% cd PGSQLSRC/contrib
+% tar xzvf wildspeed.tar.gz
+% make
+% make install
+% make installcheck
+% psql DB < wildspeed.sql
+
+   Wildspeed provides opclass (wildcard_ops) and uses partial match
+   feature of GIN, available since 8.4. Also, it supports full index scan.
+
+   The size of index can be very big, since it contains entries for all
+   permutations of the original word, see [1] for details. For example,
+   word hello will be indexed as well as its all permutations:
+=# select permute('hello');
+               permute
+--------------------------------------
+ {hello$,ello$h,llo$he,lo$hel,o$hell}
+
+   Notice, symbol '$' is used only for visualization, in actual
+   implementation null-symbol '\0' is used.
+
+   Search query rewritten as prefix search:
+*X  -> X$*
+X*Y -> Y$X*
+*X* -> X*
+
+   For example, search for 'hel*o' will be rewritten as 'o$hel'.
+
+   Special function permute(TEXT), which returns all permutations of
+   argument, provided for test purposes.
+
+   Performance of wildspeed depends on search pattern. Basically,
+   wildspeed is less effective than btree index with text_pattern_ops for
+   prefix search (the difference is greatly reduced for long prefixes) and
+   much faster for wildcard search.
+
+   Wildspeed by default uses optimization (skip short patterns if there
+   are long one), which can be turned off in Makefile by removing define
+   -DOPTIMIZE_WILDCARD_QUERY.
+
+References
+
+   1.  http://www.cs.wright.edu/~tkprasad/courses/cs499/L05TolerantIR.ppt, see also,
+   http://nlp.stanford.edu/IR-book/html/htmledition/permuterm-indexes-1.html
+
+Examples
+
+   Table words contains 747358 records, w1 and w2 columns contains the
+   same data in order to test performance of Btree (w1) and GIN (w2)
+   indexes:
+   Table "public.words"
+ Column | Type | Modifiers
+--------+------+-----------
+ w1     | text |
+ w2     | text |
+
+words=# create index bt_idx on words using btree (w1 text_pattern_ops);
+CREATE INDEX
+Time: 1885.195 ms
+words=# create index gin_idx on words using gin (w2 wildcard_ops);
+vacuum analyze;
+CREATE INDEX
+Time: 530351.223 ms
+
+Size:
+
+words=# select pg_relation_size('words');
+ pg_relation_size
+------------------
+         43253760
+
+words=# select pg_relation_size('gin_idx');
+ pg_relation_size
+------------------
+        417816576
+(1 row)
+
+words=# select pg_relation_size('bt_idx');
+ pg_relation_size
+------------------
+         23437312
+(1 row)
+
+   Prefix search:
+words=# select count(*) from words where w1 like 'a%';
+ count
+-------
+ 15491
+(1 row)
+
+Time: 7.502 ms
+words=# select count(*) from words where w2 like 'a%';
+ count
+-------
+ 15491
+(1 row)
+
+Time: 31.152 ms
+
+   Wildcard search:
+words=# select count(*) from words where w1 like '%asd%';
+ count
+-------
+    26
+(1 row)
+
+Time: 147.308 ms
+words=# select count(*) from words where w2 like '%asd%';
+ count
+-------
+    26
+(1 row)
+
+Time: 0.339 ms
+
+   Full index scan:
+words=# set enable_seqscan to off;
+words=# explain analyze select count(*) from words where w2 like '%';
+                                                                QUERY PLAN
+
+--------------------------------------------------------------------------------------------------------------
+-----------------------------
+ Aggregate  (cost=226274.98..226274.99 rows=1 width=0) (actual time=2218.709..2218.709 rows=1 loops=1)
+   ->  Bitmap Heap Scan on words  (cost=209785.73..224406.77 rows=747283 width=0) (actual time=1510.516..1913.
+430 rows=747358 loops=1)
+         Filter: (w2 ~~ '%'::text)
+         ->  Bitmap Index Scan on gin_idx  (cost=0.00..209598.91 rows=747283 width=0) (actual time=1509.358..1
+509.358 rows=747358 loops=1)
+               Index Cond: (w2 ~~ '%'::text)
+ Total runtime: 2218.747 ms
+(6 rows)
+
diff --git a/data/testlike.data b/data/testlike.data

new file mode 100644 (file)

index 0000000..df8861e
--- /dev/null
+++ b/data/testlike.data
@@ -0,0 +1,9 @@
+hello
+hella
+hillo
+helko
+hekko
+jello
+jelko
+jella
+jeeeh
diff --git a/expected/wildspeed.out b/expected/wildspeed.out

new file mode 100644 (file)

index 0000000..b5a85fc
--- /dev/null
+++ b/expected/wildspeed.out
@@ -0,0 +1,346 @@
+--
+-- first, define the datatype.  Turn off echoing so that expected file
+-- does not depend on contents of wildspeed.sql.
+--
+SET client_min_messages = warning;
+\set ECHO none
+RESET client_min_messages;
+SELECT permute('');
+ permute 
+---------
+ {$}
+(1 row)
+
+SELECT permute('0');
+ permute 
+---------
+ {0$}
+(1 row)
+
+SELECT permute('01');
+  permute  
+-----------
+ {01$,1$0}
+(1 row)
+
+SELECT permute('hello');
+               permute                
+--------------------------------------
+ {hello$,ello$h,llo$he,lo$hel,o$hell}
+(1 row)
+
+CREATE TABLE testlike ( t text );
+\copy testlike from 'data/testlike.data'
+INSERT INTO testlike VALUES ('');
+SELECT * FROM testlike WHERE t LIKE '' ORDER BY t;
+ t 
+---
+ 
+(1 row)
+
+SELECT * FROM testlike WHERE t LIKE '%' ORDER BY t;
+   t   
+-------
+ 
+ hekko
+ helko
+ hella
+ hello
+ hillo
+ jeeeh
+ jelko
+ jella
+ jello
+(10 rows)
+
+SELECT * FROM testlike WHERE t LIKE 'hello' ORDER BY t;
+   t   
+-------
+ hello
+(1 row)
+
+SELECT * FROM testlike WHERE t LIKE 'hello%' ORDER BY t;
+   t   
+-------
+ hello
+(1 row)
+
+SELECT * FROM testlike WHERE t LIKE 'hel%' ORDER BY t;
+   t   
+-------
+ helko
+ hella
+ hello
+(3 rows)
+
+SELECT * FROM testlike WHERE t LIKE '%hello' ORDER BY t;
+   t   
+-------
+ hello
+(1 row)
+
+SELECT * FROM testlike WHERE t LIKE '%lo' ORDER BY t;
+   t   
+-------
+ hello
+ hillo
+ jello
+(3 rows)
+
+SELECT * FROM testlike WHERE t LIKE '%hello%' ORDER BY t;
+   t   
+-------
+ hello
+(1 row)
+
+SELECT * FROM testlike WHERE t LIKE '%ll%' ORDER BY t;
+   t   
+-------
+ hella
+ hello
+ hillo
+ jella
+ jello
+(5 rows)
+
+SELECT * FROM testlike WHERE t LIKE 'h%o' ORDER BY t;
+   t   
+-------
+ hekko
+ helko
+ hello
+ hillo
+(4 rows)
+
+SELECT * FROM testlike WHERE t LIKE 'h%l%o' ORDER BY t;
+   t   
+-------
+ helko
+ hello
+ hillo
+(3 rows)
+
+SELECT * FROM testlike WHERE t LIKE 'h%l%o' ORDER BY t;
+   t   
+-------
+ helko
+ hello
+ hillo
+(3 rows)
+
+SELECT * FROM testlike WHERE t LIKE 'h%e%l%o' ORDER BY t;
+   t   
+-------
+ helko
+ hello
+(2 rows)
+
+SELECT * FROM testlike WHERE t LIKE '%e%l%' ORDER BY t;
+   t   
+-------
+ helko
+ hella
+ hello
+ jelko
+ jella
+ jello
+(6 rows)
+
+SELECT * FROM testlike WHERE t LIKE '%e%o' ORDER BY t;
+   t   
+-------
+ hekko
+ helko
+ hello
+ jelko
+ jello
+(5 rows)
+
+SELECT * FROM testlike WHERE t LIKE 'h%o%' ORDER BY t;
+   t   
+-------
+ hekko
+ helko
+ hello
+ hillo
+(4 rows)
+
+SELECT * FROM testlike WHERE t LIKE 'j%e%' ORDER BY t;
+   t   
+-------
+ jeeeh
+ jelko
+ jella
+ jello
+(4 rows)
+
+SELECT * FROM testlike WHERE t LIKE 'j%k%' ORDER BY t;
+   t   
+-------
+ jelko
+(1 row)
+
+SELECT * FROM testlike WHERE t LIKE 'h%k%' ORDER BY t;
+   t   
+-------
+ hekko
+ helko
+(2 rows)
+
+CREATE INDEX like_idx ON testlike USING gin ( t wildcard_ops );
+SET enable_seqscan=off;
+SELECT * FROM testlike WHERE t LIKE '' ORDER BY t;
+ t 
+---
+ 
+(1 row)
+
+SELECT * FROM testlike WHERE t LIKE '%' ORDER BY t;
+   t   
+-------
+ 
+ hekko
+ helko
+ hella
+ hello
+ hillo
+ jeeeh
+ jelko
+ jella
+ jello
+(10 rows)
+
+SELECT * FROM testlike WHERE t LIKE 'hello' ORDER BY t;
+   t   
+-------
+ hello
+(1 row)
+
+SELECT * FROM testlike WHERE t LIKE 'hello%' ORDER BY t;
+   t   
+-------
+ hello
+(1 row)
+
+SELECT * FROM testlike WHERE t LIKE 'hel%' ORDER BY t;
+   t   
+-------
+ helko
+ hella
+ hello
+(3 rows)
+
+SELECT * FROM testlike WHERE t LIKE '%hello' ORDER BY t;
+   t   
+-------
+ hello
+(1 row)
+
+SELECT * FROM testlike WHERE t LIKE '%lo' ORDER BY t;
+   t   
+-------
+ hello
+ hillo
+ jello
+(3 rows)
+
+SELECT * FROM testlike WHERE t LIKE '%hello%' ORDER BY t;
+   t   
+-------
+ hello
+(1 row)
+
+SELECT * FROM testlike WHERE t LIKE '%ll%' ORDER BY t;
+   t   
+-------
+ hella
+ hello
+ hillo
+ jella
+ jello
+(5 rows)
+
+SELECT * FROM testlike WHERE t LIKE 'h%o' ORDER BY t;
+   t   
+-------
+ hekko
+ helko
+ hello
+ hillo
+(4 rows)
+
+SELECT * FROM testlike WHERE t LIKE 'h%l%o' ORDER BY t;
+   t   
+-------
+ helko
+ hello
+ hillo
+(3 rows)
+
+SELECT * FROM testlike WHERE t LIKE 'h%l%o' ORDER BY t;
+   t   
+-------
+ helko
+ hello
+ hillo
+(3 rows)
+
+SELECT * FROM testlike WHERE t LIKE 'h%e%l%o' ORDER BY t;
+   t   
+-------
+ helko
+ hello
+(2 rows)
+
+SELECT * FROM testlike WHERE t LIKE '%e%l%' ORDER BY t;
+   t   
+-------
+ helko
+ hella
+ hello
+ jelko
+ jella
+ jello
+(6 rows)
+
+SELECT * FROM testlike WHERE t LIKE '%e%o' ORDER BY t;
+   t   
+-------
+ hekko
+ helko
+ hello
+ jelko
+ jello
+(5 rows)
+
+SELECT * FROM testlike WHERE t LIKE 'h%o%' ORDER BY t;
+   t   
+-------
+ hekko
+ helko
+ hello
+ hillo
+(4 rows)
+
+SELECT * FROM testlike WHERE t LIKE 'j%e%' ORDER BY t;
+   t   
+-------
+ jeeeh
+ jelko
+ jella
+ jello
+(4 rows)
+
+SELECT * FROM testlike WHERE t LIKE 'j%k%' ORDER BY t;
+   t   
+-------
+ jelko
+(1 row)
+
+SELECT * FROM testlike WHERE t LIKE 'h%k%' ORDER BY t;
+   t   
+-------
+ hekko
+ helko
+(2 rows)
+
diff --git a/sql/wildspeed.sql b/sql/wildspeed.sql

new file mode 100644 (file)

index 0000000..d536e69
--- /dev/null
+++ b/sql/wildspeed.sql
@@ -0,0 +1,67 @@
+--
+-- first, define the datatype.  Turn off echoing so that expected file
+-- does not depend on contents of wildspeed.sql.
+--
+
+SET client_min_messages = warning;
+\set ECHO none
+\i wildspeed.sql
+\set ECHO all
+RESET client_min_messages;
+
+
+SELECT permute('');
+SELECT permute('0');
+SELECT permute('01');
+SELECT permute('hello');
+
+CREATE TABLE testlike ( t text );
+\copy testlike from 'data/testlike.data'
+INSERT INTO testlike VALUES ('');
+
+SELECT * FROM testlike WHERE t LIKE '' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE '%' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE 'hello' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE 'hello%' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE 'hel%' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE '%hello' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE '%lo' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE '%hello%' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE '%ll%' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE 'h%o' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE 'h%l%o' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE 'h%l%o' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE 'h%e%l%o' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE '%e%l%' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE '%e%o' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE 'h%o%' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE 'j%e%' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE 'j%k%' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE 'h%k%' ORDER BY t;
+
+CREATE INDEX like_idx ON testlike USING gin ( t wildcard_ops );
+
+SET enable_seqscan=off;
+
+SELECT * FROM testlike WHERE t LIKE '' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE '%' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE 'hello' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE 'hello%' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE 'hel%' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE '%hello' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE '%lo' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE '%hello%' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE '%ll%' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE 'h%o' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE 'h%l%o' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE 'h%l%o' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE 'h%e%l%o' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE '%e%l%' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE '%e%o' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE 'h%o%' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE 'j%e%' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE 'j%k%' ORDER BY t;
+SELECT * FROM testlike WHERE t LIKE 'h%k%' ORDER BY t;
+
+
+
diff --git a/uninstall_wildspeed.sql b/uninstall_wildspeed.sql

new file mode 100644 (file)

index 0000000..978c142
--- /dev/null
+++ b/uninstall_wildspeed.sql
@@ -0,0 +1,10 @@
+BEGIN;
+
+DROP FUNCTION IF EXISTS permute(text) CASCADE;
+DROP OPERATOR CLASS IF EXISTS wildcard_ops USING gin CASCADE;
+DROP FUNCTION IF EXISTS gin_extract_permuted(text, internal)  CASCADE;
+DROP FUNCTION IF EXISTS wildcmp(text, text, bool) CASCADE;
+DROP FUNCTION IF EXISTS gin_extract_wildcard(text, internal, int2, internal) CASCADE;
+DROP FUNCTION IF EXISTS gin_consistent_wildcard(internal, int2, text) CASCADE;
+
+END;
diff --git a/wildspeed.c b/wildspeed.c

new file mode 100644 (file)

index 0000000..389dff8
--- /dev/null
+++ b/wildspeed.c
@@ -0,0 +1,651 @@
+#include "postgres.h"
+
+#include "catalog/pg_type.h"
+#include "mb/pg_wchar.h"
+#include "utils/array.h"
+
+/*
+ * MARK_SIGN is a sign of end of string and this character should
+ * be regular character. The best candidate of this is a zero byte
+ * which is accepted for any locale used by postgres. But it's impossible
+ * to show it, so we will replace it to another one (MARK_SIGN_SHOW) which 
+ * can be noticed well. But we can't use it as mark because it's allowed
+ * to be inside string.
+ */ 
+
+#define                MARK_SIGN               '\0'
+#define                MARK_SIGN_SHOW  '$'
+
+
+#define WC_BEGIN               0x01   /* should be in begining of string */
+#define WC_MIDDLE              0x02   /* should be in middle of string */
+#define WC_END                 0x04   /* should be in end of string */
+
+PG_MODULE_MAGIC;
+
+static text*
+appendStrToText( text *src, char *str, int32 len, int32 maxlen )
+{
+       int32   curlen;
+
+       if (src == NULL )
+       {
+               Assert( maxlen >= 0 );
+               src = (text*)palloc( VARHDRSZ + sizeof(char*) * maxlen );
+               SET_VARSIZE(src, 0 + VARHDRSZ);
+       }
+
+       curlen = VARSIZE(src) - VARHDRSZ;
+
+       if (len>0)
+               memcpy( VARDATA(src) + curlen, str, len );
+
+       SET_VARSIZE(src, curlen + len + VARHDRSZ);
+
+       return src;
+}
+
+static text*
+appendMarkToText( text *src, int32 maxlen )
+{
+       char sign = MARK_SIGN;
+
+       return appendStrToText( src, &sign, 1, maxlen );
+}
+
+static text*
+setFlagOfText( char flag, int32 maxlen )
+{
+       char flagstruct[2];
+
+       Assert( maxlen > 0 );
+       /*
+        * Mark text by setting first byte to MARK_SIGN to indicate
+        * that text has flags. It's a safe for non empty string, 
+        * because first character can not be a MARK_SIGN (see
+        * gin_extract_permuted() )
+        */
+
+       flagstruct[0] = MARK_SIGN;
+       flagstruct[1] = flag;
+       
+       return appendStrToText(NULL, flagstruct, 2, maxlen );
+}
+
+PG_FUNCTION_INFO_V1(gin_extract_permuted);
+Datum          gin_extract_permuted(PG_FUNCTION_ARGS);
+Datum
+gin_extract_permuted(PG_FUNCTION_ARGS)
+{
+       text    *src = PG_GETARG_TEXT_P(0);
+       int32   *nentries = (int32 *) PG_GETARG_POINTER(1);
+       Datum   *entries = NULL;
+       int32   srclen = pg_mbstrlen_with_len(VARDATA(src), VARSIZE(src) - VARHDRSZ);
+
+       *nentries = srclen;
+
+       if ( srclen == 0 )
+       {
+               /*
+                * Empty string is encoded by alone MARK_SIGN character
+                */
+               *nentries = 1;
+               entries = (Datum*) palloc(sizeof(Datum));
+               entries[0] = PointerGetDatum( appendMarkToText( NULL, 1 ) );
+       }
+       else
+       {
+               text    *dst;
+               int32   i, 
+                               offset=0; /* offset to current position in src in bytes */ 
+               int32   nbytes = VARSIZE(src) - VARHDRSZ;
+               char    *srcptr = VARDATA(src);
+
+               /*
+                * Permutation: hello will be permuted to hello$, ello$h, llo$he, lo$hel, o$hell.
+                * So, number of entries is equial to number of characters (not a bytes)
+                */
+
+               entries = (Datum*)palloc(sizeof(char*) * nbytes );
+               for(i=0; i<srclen;i++) {
+               
+                       /*
+                        * Copy first part. For llo$he it will be 'llo'
+                        */
+                       dst = appendStrToText( NULL, srcptr + offset, nbytes - offset, nbytes + 1 ); 
+
+                       /*
+                        * Set mark sign ($)
+                        */
+                       dst = appendMarkToText( dst, -1 );
+
+                       /*
+                        * Copy rest of string (in example above 'he')
+                        */
+                       dst = appendStrToText( dst, srcptr, offset, -1 );
+
+                       entries[i] = PointerGetDatum(dst);
+                       offset += pg_mblen( srcptr + offset );
+               }
+       }
+
+       PG_FREE_IF_COPY(src,0);
+       PG_RETURN_POINTER(entries);
+}
+
+PG_FUNCTION_INFO_V1(wildcmp);
+Datum       wildcmp(PG_FUNCTION_ARGS);
+Datum
+wildcmp(PG_FUNCTION_ARGS)
+{
+       text    *a = PG_GETARG_TEXT_P(0);
+       text    *b = PG_GETARG_TEXT_P(1);
+       bool    partialMatch = PG_GETARG_BOOL(2);
+       int32   cmp;
+       int             lena,
+                       lenb;
+       char    *ptra = VARDATA(a),
+                       *ptrb = VARDATA(b);
+       char    flag = 0;
+
+       lena = VARSIZE(a) - VARHDRSZ;
+       lenb = VARSIZE(b) - VARHDRSZ;
+
+       /*
+        * sets correct pointers and lengths in case of flags
+        * presence
+        */
+       if ( lena > 2 && *ptra == MARK_SIGN )
+       {
+               flag = *(ptra+1);
+               ptra+=2;
+               lena-=2;
+
+               if ( lenb > 2 && *ptrb == MARK_SIGN )
+               {
+                       /*
+                        * If they have different flags then they can not be equal, this 
+                        * place works only during check of equality of keys
+                        * to search
+                        */
+                       if ( flag != *(ptrb+1) )
+                               return 1;
+                       ptrb+=2;
+                       lenb-=2;
+
+                       /* b can not be a product of gin_extract_wildcard for partial match mode */
+                       Assert( partialMatch == false );
+               }
+       } 
+       else  if ( lenb > 2 && *ptrb == MARK_SIGN )
+       {
+               /* b can not be a product of gin_extract_wildcard for partial match mode */
+               Assert( partialMatch == false );
+
+               ptrb+=2;
+               lenb-=2;
+       }
+
+       if ( lena == 0 )
+       {
+               if ( partialMatch )
+                       cmp = 0; /* full scan for partialMatch*/
+               else
+                       cmp = (lenb>0) ? -1 : 0;
+       }
+       else
+       {
+               /*
+                * We couldn't use strcmp because of MARK_SIGN
+                */
+               cmp = memcmp(ptra, ptrb, Min(lena, lenb));
+
+               if ( partialMatch )
+               {
+                       if ( cmp == 0 )
+                       {
+                               if ( lena > lenb )
+                               {
+                                       /*
+                                        * b argument is not beginning with argument a
+                                        */
+                                       cmp = 1;
+                               }
+                               else if ( flag > 0 && lenb>lena /* be safe */ )
+                               { /* there is some flags to check */
+                                       char    actualFlag;
+
+                                       if ( ptrb[ lenb - 1 ] == MARK_SIGN )
+                                               actualFlag = WC_BEGIN;  
+                                       else if ( ptrb[ lena ] == MARK_SIGN )
+                                               actualFlag = WC_END;
+                                       else
+                                               actualFlag = WC_MIDDLE;
+
+                                       if ( (flag & actualFlag) == 0 )
+                                       {
+                                               /* 
+                                                * Prefix are matched but this prefix s not placed as needed.
+                                                * so we should give a smoke signal to GIN that we don't want
+                                                * this match, but wish to continue scan 
+                                                */
+                                               cmp = -1;
+                                       }
+                               }
+                       } 
+                       else if (cmp < 0)
+                       {
+                               cmp = 1; /* prevent continue scan */
+                       }
+               } 
+               else if ( (cmp == 0) && (lena != lenb) )
+               {
+                       cmp = (lena < lenb) ? -1 : 1;
+               }
+       }
+
+       PG_FREE_IF_COPY(a,0);
+       PG_FREE_IF_COPY(b,1);
+       PG_RETURN_INT32( cmp ); 
+}
+
+#ifdef OPTIMIZE_WILDCARD_QUERY
+
+typedef struct 
+{
+       Datum   entry;
+       int32   len;
+       char    flag;
+} OptItem;
+
+
+/*
+ * Function drops most short search word to speedup 
+ * index search by preventing use word which gives
+ * a lot of matches
+ */
+static void 
+optimize_wildcard_search( Datum *entries, int32 *nentries )
+{
+       int32   maxlen=0;
+       OptItem *items;
+       int             i, nitems = *nentries;
+       char    *ptr,*p;
+
+       items = (OptItem*)palloc( sizeof(OptItem) * (*nentries) );
+       for(i=0;i<nitems;i++)
+       {
+               items[i].entry = entries[i];
+               items[i].len = VARSIZE(entries[i]) - VARHDRSZ;
+               ptr = VARDATA(entries[i]);
+
+               if ( items[i].len > 2 && *ptr == MARK_SIGN )
+               {
+                       items[i].len-=2;
+                       items[i].flag = *(ptr+1);
+               }
+               else
+               {
+                       items[i].flag = 0;
+                       if ( items[i].len > 1 && (p=strchr(ptr, MARK_SIGN)) != NULL )
+                       {
+                               if ( p == ptr + items[i].len -1 )
+                                       items[i].flag = WC_BEGIN;
+                               else 
+                                       items[i].flag = WC_BEGIN | WC_END;
+                       }
+               }
+
+               if ( items[i].len > maxlen )
+                       maxlen = items[i].len;
+       }
+       
+       *nentries=0;
+
+       for(i=0;i<nitems;i++)
+       {
+               if ( (items[i].flag & WC_BEGIN) && (items[i].flag & WC_END) )
+               {       /* X$Y use always */
+                       entries[ *nentries ] = items[i].entry;
+                       (*nentries)++;
+               }
+               else if ( (items[i].flag & WC_MIDDLE) == 0 )
+               { 
+                       /* 
+                        * for begin-only or end-only word we set more low limit than for 
+                        * other variants
+                        */
+                       if ( 3*items[i].len > maxlen )
+                       {
+                               entries[ *nentries ] = items[i].entry;
+                               (*nentries)++;
+                       }
+               }
+               else if ( 2*items[i].len > maxlen )
+               {       
+                       /* 
+                        * use only items with biggest length 
+                        */
+                       entries[ *nentries ] = items[i].entry;
+                       (*nentries)++;
+               }
+       }
+
+       Assert( *nentries>0 );
+
+}
+#endif
+
+typedef struct 
+{
+       bool    iswildcard;
+       int32   len;
+       char    *ptr;
+} WildItem;
+
+PG_FUNCTION_INFO_V1(gin_extract_wildcard);
+Datum          gin_extract_wildcard(PG_FUNCTION_ARGS);
+Datum
+gin_extract_wildcard(PG_FUNCTION_ARGS)
+{
+       text                    *q = PG_GETARG_TEXT_P(0);
+       int32                   lenq = VARSIZE(q) - VARHDRSZ;
+       int32                   *nentries = (int32 *) PG_GETARG_POINTER(1);
+#ifdef NOT_USED
+       StrategyNumber  strategy = PG_GETARG_UINT16(2);
+#endif
+       bool                    *partialmatch, 
+                                       **ptr_partialmatch = (bool**) PG_GETARG_POINTER(3);
+       Datum                   *entries = NULL;
+       char                    *qptr = VARDATA(q);
+       int                             clen,
+                                       splitqlen = 0,
+                                       i;
+       WildItem                *items;
+       text                    *entry;
+
+       *nentries = 0;
+
+       if ( lenq == 0 )
+       {
+               partialmatch = *ptr_partialmatch = (bool*)palloc0(sizeof(bool));
+               *nentries = 1;
+               entries = (Datum*) palloc(sizeof(Datum));
+               entries[0] = PointerGetDatum( appendMarkToText( NULL, 1 ) );
+
+               PG_RETURN_POINTER(entries);
+       }
+
+       partialmatch = *ptr_partialmatch = (bool*)palloc0(sizeof(bool) * lenq);
+       entries = (Datum*) palloc(sizeof(Datum) * lenq);
+       items=(WildItem*) palloc0( sizeof(WildItem) * lenq );
+
+
+       /*
+        * Parse expression to the list of constant parts and
+        * wildcards
+        */
+       while( qptr - VARDATA(q) < lenq )
+       {
+               clen = pg_mblen(qptr);
+
+               if ( clen==1 && (*qptr == '_' || *qptr == '%' ) )
+               {
+                       if ( splitqlen == 0 )
+                       {
+                               items[ splitqlen ].iswildcard = true;
+                               splitqlen++;
+                       } 
+                       else if ( items[ splitqlen-1 ].iswildcard == false )
+                       {
+                               items[ splitqlen-1 ].len = qptr - items[ splitqlen-1 ].ptr;
+                               items[ splitqlen ].iswildcard = true;
+                               splitqlen++;
+                       }
+                       /*
+                        * ignore wildcard, because we don't make difference beetween
+                        * %, _ or a combination of its
+                        */
+               }
+               else
+               {
+                       if ( splitqlen == 0 || items[ splitqlen-1 ].iswildcard == true )
+                       {
+                               items[ splitqlen ].ptr = qptr;
+                               splitqlen++;
+                       }
+               }
+               qptr += clen;
+       }
+
+       Assert( splitqlen >= 1 );
+       if ( items[ splitqlen-1 ].iswildcard == false )
+               items[ splitqlen-1 ].len = qptr - items[ splitqlen-1 ].ptr;
+
+       if ( items[ 0 ].iswildcard == false )
+       {
+               /* X... */
+               if ( splitqlen == 1 )
+               {
+                       /*   X => X$, exact match */
+                       *nentries = 1;
+                       entry = appendStrToText(NULL, items[ 0 ].ptr, items[ 0 ].len, lenq+1);
+                       entry = appendMarkToText( entry, -1 );
+                       entries[0] = PointerGetDatum( entry );
+               } 
+               else if ( items[ splitqlen-1 ].iswildcard == false ) 
+               {
+                       /*   X * [X1 * [] ] ] Y => Y$X* [ + X1* [] ] */
+
+                       *nentries = 1;
+                       entry = appendStrToText(NULL, items[ splitqlen-1 ].ptr, items[ splitqlen-1 ].len, lenq+1);
+                       entry = appendMarkToText( entry, -1 );
+                       entry = appendStrToText(entry, items[ 0 ].ptr, items[ 0 ].len, -1);
+                       partialmatch[0] = true;
+                       entries[0] = PointerGetDatum( entry );
+
+                       for(i=1; i<splitqlen-1; i++)
+                       {
+                               if ( items[ i ].iswildcard )
+                                       continue;
+                               entry = setFlagOfText( WC_MIDDLE, lenq + 1 /* MARK_SIGN */ + 2 /* flag */ ); 
+                               entry = appendStrToText(entry, items[ i ].ptr, items[ i ].len, -1 );
+                               partialmatch[ *nentries ] = true;
+                               entries[ *nentries ] =  PointerGetDatum( entry );
+                               (*nentries)++;
+                       }
+               }
+               else
+               {
+                       /*   X * [ X1 * [] ]  => X*$ [ + X1* [] ] */
+               
+                       entry = setFlagOfText( WC_BEGIN, lenq + 1 /* MARK_SIGN */ + 2 /* flag */ );
+                       entry = appendStrToText(entry, items[ 0 ].ptr, items[ 0 ].len, -1);
+                       *nentries = 1;
+                       partialmatch[ 0 ] = true;
+                       entries[0] = PointerGetDatum( entry );
+
+                       for(i=2; i<splitqlen-1; i++)
+                       {
+                               if ( items[ i ].iswildcard )
+                                       continue;
+                               entry = setFlagOfText( (i==splitqlen-2) ? (WC_MIDDLE | WC_END) : WC_MIDDLE, 
+                                                                               lenq + 1 /* MARK_SIGN */ + 2 /* flag */ );
+                               entry = appendStrToText(entry, items[ i ].ptr, items[ i ].len, -1);
+                               partialmatch[ *nentries ] = true;
+                               entries[ *nentries ] =  PointerGetDatum( entry );
+                               (*nentries)++;
+                       }
+               }
+       } 
+       else
+       {
+               /* *...  */
+
+               if ( splitqlen == 1 )
+               {
+                       /* any word => full scan */
+                       *nentries = 1;
+                       entry = appendStrToText(NULL, "", 0, lenq+1);
+                       partialmatch[0] = true;
+                       entries[0] = PointerGetDatum( entry );
+               }
+               else if ( items[ splitqlen-1 ].iswildcard == false )
+               {
+                       /*     * [ X1 * [] ] X  => X$* [ + X1* [] ]  */
+                       *nentries = 1;
+                       entry = appendStrToText(NULL, items[ splitqlen-1 ].ptr, items[ splitqlen-1 ].len, lenq+1);
+                       entry = appendMarkToText( entry, -1 );
+                       partialmatch[0] = true;
+                       entries[0] = PointerGetDatum( entry );
+
+                       for(i=1; i<splitqlen-1; i++)
+                       {
+                               if ( items[ i ].iswildcard )
+                                       continue;
+                               entry = setFlagOfText( (i==1) ? (WC_MIDDLE | WC_BEGIN) : WC_MIDDLE, 
+                                                                               lenq + 1 /* MARK_SIGN */ + 2 /* flag */ );
+                               entry = appendStrToText(entry, items[ i ].ptr, items[ i ].len, -1);
+                               partialmatch[ *nentries ] = true;
+                               entries[ *nentries ] =  PointerGetDatum( entry );
+                               (*nentries)++;
+                       }
+               }
+               else
+               {
+                       /* * X [ * X1 [] ] * => X* [ + X1* [] ] */
+                       for(i=1; i<splitqlen-1; i++)
+                       {
+                               if ( items[ i ].iswildcard )
+                                       continue;
+
+                               if ( splitqlen > 3 )
+                               {
+                                       if ( i==1 )
+                                               entry = setFlagOfText( WC_MIDDLE | WC_BEGIN, lenq + 1 /* MARK_SIGN */ + 2 /* flag */ );
+                                       else if ( i == splitqlen-2 )
+                                               entry = setFlagOfText( WC_MIDDLE | WC_END, lenq + 1 /* MARK_SIGN */ + 2 /* flag */ );
+                                       else
+                                               entry = setFlagOfText( WC_MIDDLE, lenq + 1 /* MARK_SIGN */ + 2 /* flag */ ); 
+                               }
+                               else
+                                       entry = NULL;
+                               entry = appendStrToText(entry, items[ i ].ptr, items[ i ].len, lenq+1);
+                               partialmatch[ *nentries ] = true;
+                               entries[ *nentries ] =  PointerGetDatum( entry );
+                               (*nentries)++;
+                       }
+               }
+       }
+
+       PG_FREE_IF_COPY(q,0);
+
+#ifdef OPTIMIZE_WILDCARD_QUERY
+       if ( *nentries > 1 )
+               optimize_wildcard_search( entries, nentries );
+#endif
+
+       PG_RETURN_POINTER(entries);
+}
+
+
+PG_FUNCTION_INFO_V1(gin_consistent_wildcard);
+Datum       gin_consistent_wildcard(PG_FUNCTION_ARGS);
+Datum
+gin_consistent_wildcard(PG_FUNCTION_ARGS)
+{
+       bool            *check = (bool *) PG_GETARG_POINTER(0);
+       bool        res = true;
+       int         i;
+       int32           nentries;
+
+       if ( fcinfo->flinfo->fn_extra == NULL )
+       {
+               bool    *pmatch;
+
+               /*
+                * we need to get nentries, we'll get it by regular way
+                * and store it in function context
+                */
+
+               fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
+                                                                                                               sizeof(int32));
+
+               DirectFunctionCall4(
+                                       gin_extract_wildcard,
+                                       PG_GETARG_DATUM(2),  /* query */
+                                       PointerGetDatum( fcinfo->flinfo->fn_extra ), /* &nentries */
+                                       PG_GETARG_DATUM(1),  /* strategy */
+                                       PointerGetDatum( &pmatch )
+               );
+       }
+
+       nentries = *(int32*) fcinfo->flinfo->fn_extra;
+
+       for (i = 0; res && i < nentries; i++)
+               if (check[i] == false)
+                       res = false;
+
+       PG_RETURN_BOOL(res);
+}
+
+/*
+ * Mostly debug fuction
+ */
+PG_FUNCTION_INFO_V1(permute);
+Datum       permute(PG_FUNCTION_ARGS);
+Datum
+permute(PG_FUNCTION_ARGS)
+{
+       Datum           src = PG_GETARG_DATUM(0);
+       int32           nentries = 0;
+       Datum           *entries;
+       ArrayType       *res;
+       int             i;
+
+       /*
+        * Get permuted values by gin_extract_permuted()
+        */
+       entries = (Datum*) DatumGetPointer(DirectFunctionCall2(
+                                       gin_extract_permuted, src, PointerGetDatum(&nentries)
+                       ));
+
+       /*
+        * We need to replace MARK_SIGN to MARK_SIGN_SHOW.
+        * See comments above near definition of MARK_SIGN and MARK_SIGN_SHOW.
+        */
+       if ( nentries == 1 && VARSIZE(entries[0]) == VARHDRSZ + 1)
+       {
+               *(VARDATA(entries[0])) = MARK_SIGN_SHOW;                
+       }
+       else
+       {
+               int32   offset = 0; /* offset of MARK_SIGN */
+               char    *ptr;
+
+               /*
+                * We scan array from the end because it allows simple calculation
+                * of MARK_SIGN position: on every iteration it's moved one 
+                * character to the end.
+                */
+               for(i=nentries-1;i>=0;i--) 
+               {
+                       ptr = VARDATA(entries[i]);
+
+                       offset += pg_mblen(ptr);
+                       Assert( *(ptr + offset) == MARK_SIGN );
+                       *(ptr + offset) = MARK_SIGN_SHOW;
+               }
+       }
+
+       res = construct_array(
+                                       entries,
+                                       nentries,
+                                       TEXTOID,
+                                       -1,
+                                       false,
+                                       'i'
+                       );
+
+       PG_RETURN_POINTER(res);
+}
diff --git a/wildspeed.sql.in b/wildspeed.sql.in

new file mode 100644 (file)

index 0000000..a8c5cdb
--- /dev/null
+++ b/wildspeed.sql.in
@@ -0,0 +1,41 @@
+BEGIN;
+
+-- support functions for gin
+CREATE OR REPLACE FUNCTION gin_extract_permuted(text, internal)
+RETURNS internal
+AS 'MODULE_PATHNAME'
+LANGUAGE C IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION wildcmp(text, text, bool)
+RETURNS int32
+AS 'MODULE_PATHNAME'
+LANGUAGE C IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION gin_extract_wildcard(text, internal, int2, internal)
+RETURNS internal
+AS 'MODULE_PATHNAME'
+LANGUAGE C IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION gin_consistent_wildcard(internal, int2, text)
+RETURNS internal
+AS 'MODULE_PATHNAME'
+LANGUAGE C IMMUTABLE;
+
+CREATE OPERATOR CLASS wildcard_ops
+FOR TYPE text USING gin
+AS
+    OPERATOR        1       ~~ RECHECK,
+       FUNCTION        1       wildcmp(text,text,bool),
+       FUNCTION        2       gin_extract_permuted(text, internal),
+       FUNCTION        3       gin_extract_wildcard(text, internal, int2, internal),
+       FUNCTION        4       gin_consistent_wildcard(internal, int2, text),
+STORAGE         text;
+
+
+--debug function
+CREATE OR REPLACE FUNCTION permute(text)
+RETURNS _text
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT IMMUTABLE;
+
+COMMIT;
author	teodor <teodor>
	Fri, 4 Apr 2008 16:02:14 +0000 (16:02 +0000)
committer	teodor <teodor>
	Fri, 4 Apr 2008 16:02:14 +0000 (16:02 +0000)
Makefile	[new file with mode: 0644]	patch \| blob
README.wildspeed	[new file with mode: 0644]	patch \| blob
data/testlike.data	[new file with mode: 0644]	patch \| blob
expected/wildspeed.out	[new file with mode: 0644]	patch \| blob
sql/wildspeed.sql	[new file with mode: 0644]	patch \| blob
uninstall_wildspeed.sql	[new file with mode: 0644]	patch \| blob
wildspeed.c	[new file with mode: 0644]	patch \| blob
wildspeed.sql.in	[new file with mode: 0644]	patch \| blob