From: teodor Date: Fri, 4 Apr 2008 16:02:14 +0000 (+0000) Subject: Initial revision X-Git-Tag: start~1 X-Git-Url: http://sigaev.ru/git/gitweb.cgi?p=wildspeed.git;a=commitdiff_plain;h=ceecc7a42902dd4f7073c15d825df252817ba154 Initial revision --- ceecc7a42902dd4f7073c15d825df252817ba154 diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..8d36bf4 --- /dev/null +++ b/Makefile @@ -0,0 +1,14 @@ +PG_CPPFLAGS = -DOPTIMIZE_WILDCARD_QUERY +MODULE_big = wildspeed +OBJS = wildspeed.o + +DATA_built = wildspeed.sql +DATA = uninstall_wildspeed.sql +REGRESS = wildspeed + + +subdir = contrib/wildspeed +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global + +include $(top_srcdir)/contrib/contrib-global.mk diff --git a/README.wildspeed b/README.wildspeed new file mode 100644 index 0000000..1bee021 --- /dev/null +++ b/README.wildspeed @@ -0,0 +1,158 @@ +WildSpeed - fast wildcard search for LIKE operator + + Wildspeed extension provides GIN index support for wildcard search + for LIKE operator. + + Online version of this document is available + http://www.sai.msu.su/~megera/wiki/wildspeed + +Authors + + * Oleg Bartunov , Moscow, Moscow University, Russia + * Teodor Sigaev , Moscow, Moscow University,Russia + +License + + Stable version, included into PostgreSQL distribution, released under + BSD license. Development version, available from this site, released + under the GNU General Public License, version 2 (June 1991) + +Downloads + + Stable version of wildspeed is available from + http://www.sigaev.ru/cvsweb/cvsweb.cgi/wildspeed/ + +Installation + +% cd PGSQLSRC/contrib +% tar xzvf wildspeed.tar.gz +% make +% make install +% make installcheck +% psql DB < wildspeed.sql + + Wildspeed provides opclass (wildcard_ops) and uses partial match + feature of GIN, available since 8.4. Also, it supports full index scan. + + The size of index can be very big, since it contains entries for all + permutations of the original word, see [1] for details. For example, + word hello will be indexed as well as its all permutations: +=# select permute('hello'); + permute +-------------------------------------- + {hello$,ello$h,llo$he,lo$hel,o$hell} + + Notice, symbol '$' is used only for visualization, in actual + implementation null-symbol '\0' is used. + + Search query rewritten as prefix search: +*X -> X$* +X*Y -> Y$X* +*X* -> X* + + For example, search for 'hel*o' will be rewritten as 'o$hel'. + + Special function permute(TEXT), which returns all permutations of + argument, provided for test purposes. + + Performance of wildspeed depends on search pattern. Basically, + wildspeed is less effective than btree index with text_pattern_ops for + prefix search (the difference is greatly reduced for long prefixes) and + much faster for wildcard search. + + Wildspeed by default uses optimization (skip short patterns if there + are long one), which can be turned off in Makefile by removing define + -DOPTIMIZE_WILDCARD_QUERY. + +References + + 1. http://www.cs.wright.edu/~tkprasad/courses/cs499/L05TolerantIR.ppt, see also, + http://nlp.stanford.edu/IR-book/html/htmledition/permuterm-indexes-1.html + +Examples + + Table words contains 747358 records, w1 and w2 columns contains the + same data in order to test performance of Btree (w1) and GIN (w2) + indexes: + Table "public.words" + Column | Type | Modifiers +--------+------+----------- + w1 | text | + w2 | text | + +words=# create index bt_idx on words using btree (w1 text_pattern_ops); +CREATE INDEX +Time: 1885.195 ms +words=# create index gin_idx on words using gin (w2 wildcard_ops); +vacuum analyze; +CREATE INDEX +Time: 530351.223 ms + +Size: + +words=# select pg_relation_size('words'); + pg_relation_size +------------------ + 43253760 + +words=# select pg_relation_size('gin_idx'); + pg_relation_size +------------------ + 417816576 +(1 row) + +words=# select pg_relation_size('bt_idx'); + pg_relation_size +------------------ + 23437312 +(1 row) + + Prefix search: +words=# select count(*) from words where w1 like 'a%'; + count +------- + 15491 +(1 row) + +Time: 7.502 ms +words=# select count(*) from words where w2 like 'a%'; + count +------- + 15491 +(1 row) + +Time: 31.152 ms + + Wildcard search: +words=# select count(*) from words where w1 like '%asd%'; + count +------- + 26 +(1 row) + +Time: 147.308 ms +words=# select count(*) from words where w2 like '%asd%'; + count +------- + 26 +(1 row) + +Time: 0.339 ms + + Full index scan: +words=# set enable_seqscan to off; +words=# explain analyze select count(*) from words where w2 like '%'; + QUERY PLAN + +-------------------------------------------------------------------------------------------------------------- +----------------------------- + Aggregate (cost=226274.98..226274.99 rows=1 width=0) (actual time=2218.709..2218.709 rows=1 loops=1) + -> Bitmap Heap Scan on words (cost=209785.73..224406.77 rows=747283 width=0) (actual time=1510.516..1913. +430 rows=747358 loops=1) + Filter: (w2 ~~ '%'::text) + -> Bitmap Index Scan on gin_idx (cost=0.00..209598.91 rows=747283 width=0) (actual time=1509.358..1 +509.358 rows=747358 loops=1) + Index Cond: (w2 ~~ '%'::text) + Total runtime: 2218.747 ms +(6 rows) + diff --git a/data/testlike.data b/data/testlike.data new file mode 100644 index 0000000..df8861e --- /dev/null +++ b/data/testlike.data @@ -0,0 +1,9 @@ +hello +hella +hillo +helko +hekko +jello +jelko +jella +jeeeh diff --git a/expected/wildspeed.out b/expected/wildspeed.out new file mode 100644 index 0000000..b5a85fc --- /dev/null +++ b/expected/wildspeed.out @@ -0,0 +1,346 @@ +-- +-- first, define the datatype. Turn off echoing so that expected file +-- does not depend on contents of wildspeed.sql. +-- +SET client_min_messages = warning; +\set ECHO none +RESET client_min_messages; +SELECT permute(''); + permute +--------- + {$} +(1 row) + +SELECT permute('0'); + permute +--------- + {0$} +(1 row) + +SELECT permute('01'); + permute +----------- + {01$,1$0} +(1 row) + +SELECT permute('hello'); + permute +-------------------------------------- + {hello$,ello$h,llo$he,lo$hel,o$hell} +(1 row) + +CREATE TABLE testlike ( t text ); +\copy testlike from 'data/testlike.data' +INSERT INTO testlike VALUES (''); +SELECT * FROM testlike WHERE t LIKE '' ORDER BY t; + t +--- + +(1 row) + +SELECT * FROM testlike WHERE t LIKE '%' ORDER BY t; + t +------- + + hekko + helko + hella + hello + hillo + jeeeh + jelko + jella + jello +(10 rows) + +SELECT * FROM testlike WHERE t LIKE 'hello' ORDER BY t; + t +------- + hello +(1 row) + +SELECT * FROM testlike WHERE t LIKE 'hello%' ORDER BY t; + t +------- + hello +(1 row) + +SELECT * FROM testlike WHERE t LIKE 'hel%' ORDER BY t; + t +------- + helko + hella + hello +(3 rows) + +SELECT * FROM testlike WHERE t LIKE '%hello' ORDER BY t; + t +------- + hello +(1 row) + +SELECT * FROM testlike WHERE t LIKE '%lo' ORDER BY t; + t +------- + hello + hillo + jello +(3 rows) + +SELECT * FROM testlike WHERE t LIKE '%hello%' ORDER BY t; + t +------- + hello +(1 row) + +SELECT * FROM testlike WHERE t LIKE '%ll%' ORDER BY t; + t +------- + hella + hello + hillo + jella + jello +(5 rows) + +SELECT * FROM testlike WHERE t LIKE 'h%o' ORDER BY t; + t +------- + hekko + helko + hello + hillo +(4 rows) + +SELECT * FROM testlike WHERE t LIKE 'h%l%o' ORDER BY t; + t +------- + helko + hello + hillo +(3 rows) + +SELECT * FROM testlike WHERE t LIKE 'h%l%o' ORDER BY t; + t +------- + helko + hello + hillo +(3 rows) + +SELECT * FROM testlike WHERE t LIKE 'h%e%l%o' ORDER BY t; + t +------- + helko + hello +(2 rows) + +SELECT * FROM testlike WHERE t LIKE '%e%l%' ORDER BY t; + t +------- + helko + hella + hello + jelko + jella + jello +(6 rows) + +SELECT * FROM testlike WHERE t LIKE '%e%o' ORDER BY t; + t +------- + hekko + helko + hello + jelko + jello +(5 rows) + +SELECT * FROM testlike WHERE t LIKE 'h%o%' ORDER BY t; + t +------- + hekko + helko + hello + hillo +(4 rows) + +SELECT * FROM testlike WHERE t LIKE 'j%e%' ORDER BY t; + t +------- + jeeeh + jelko + jella + jello +(4 rows) + +SELECT * FROM testlike WHERE t LIKE 'j%k%' ORDER BY t; + t +------- + jelko +(1 row) + +SELECT * FROM testlike WHERE t LIKE 'h%k%' ORDER BY t; + t +------- + hekko + helko +(2 rows) + +CREATE INDEX like_idx ON testlike USING gin ( t wildcard_ops ); +SET enable_seqscan=off; +SELECT * FROM testlike WHERE t LIKE '' ORDER BY t; + t +--- + +(1 row) + +SELECT * FROM testlike WHERE t LIKE '%' ORDER BY t; + t +------- + + hekko + helko + hella + hello + hillo + jeeeh + jelko + jella + jello +(10 rows) + +SELECT * FROM testlike WHERE t LIKE 'hello' ORDER BY t; + t +------- + hello +(1 row) + +SELECT * FROM testlike WHERE t LIKE 'hello%' ORDER BY t; + t +------- + hello +(1 row) + +SELECT * FROM testlike WHERE t LIKE 'hel%' ORDER BY t; + t +------- + helko + hella + hello +(3 rows) + +SELECT * FROM testlike WHERE t LIKE '%hello' ORDER BY t; + t +------- + hello +(1 row) + +SELECT * FROM testlike WHERE t LIKE '%lo' ORDER BY t; + t +------- + hello + hillo + jello +(3 rows) + +SELECT * FROM testlike WHERE t LIKE '%hello%' ORDER BY t; + t +------- + hello +(1 row) + +SELECT * FROM testlike WHERE t LIKE '%ll%' ORDER BY t; + t +------- + hella + hello + hillo + jella + jello +(5 rows) + +SELECT * FROM testlike WHERE t LIKE 'h%o' ORDER BY t; + t +------- + hekko + helko + hello + hillo +(4 rows) + +SELECT * FROM testlike WHERE t LIKE 'h%l%o' ORDER BY t; + t +------- + helko + hello + hillo +(3 rows) + +SELECT * FROM testlike WHERE t LIKE 'h%l%o' ORDER BY t; + t +------- + helko + hello + hillo +(3 rows) + +SELECT * FROM testlike WHERE t LIKE 'h%e%l%o' ORDER BY t; + t +------- + helko + hello +(2 rows) + +SELECT * FROM testlike WHERE t LIKE '%e%l%' ORDER BY t; + t +------- + helko + hella + hello + jelko + jella + jello +(6 rows) + +SELECT * FROM testlike WHERE t LIKE '%e%o' ORDER BY t; + t +------- + hekko + helko + hello + jelko + jello +(5 rows) + +SELECT * FROM testlike WHERE t LIKE 'h%o%' ORDER BY t; + t +------- + hekko + helko + hello + hillo +(4 rows) + +SELECT * FROM testlike WHERE t LIKE 'j%e%' ORDER BY t; + t +------- + jeeeh + jelko + jella + jello +(4 rows) + +SELECT * FROM testlike WHERE t LIKE 'j%k%' ORDER BY t; + t +------- + jelko +(1 row) + +SELECT * FROM testlike WHERE t LIKE 'h%k%' ORDER BY t; + t +------- + hekko + helko +(2 rows) + diff --git a/sql/wildspeed.sql b/sql/wildspeed.sql new file mode 100644 index 0000000..d536e69 --- /dev/null +++ b/sql/wildspeed.sql @@ -0,0 +1,67 @@ +-- +-- first, define the datatype. Turn off echoing so that expected file +-- does not depend on contents of wildspeed.sql. +-- + +SET client_min_messages = warning; +\set ECHO none +\i wildspeed.sql +\set ECHO all +RESET client_min_messages; + + +SELECT permute(''); +SELECT permute('0'); +SELECT permute('01'); +SELECT permute('hello'); + +CREATE TABLE testlike ( t text ); +\copy testlike from 'data/testlike.data' +INSERT INTO testlike VALUES (''); + +SELECT * FROM testlike WHERE t LIKE '' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE '%' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE 'hello' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE 'hello%' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE 'hel%' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE '%hello' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE '%lo' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE '%hello%' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE '%ll%' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE 'h%o' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE 'h%l%o' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE 'h%l%o' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE 'h%e%l%o' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE '%e%l%' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE '%e%o' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE 'h%o%' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE 'j%e%' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE 'j%k%' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE 'h%k%' ORDER BY t; + +CREATE INDEX like_idx ON testlike USING gin ( t wildcard_ops ); + +SET enable_seqscan=off; + +SELECT * FROM testlike WHERE t LIKE '' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE '%' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE 'hello' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE 'hello%' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE 'hel%' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE '%hello' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE '%lo' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE '%hello%' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE '%ll%' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE 'h%o' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE 'h%l%o' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE 'h%l%o' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE 'h%e%l%o' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE '%e%l%' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE '%e%o' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE 'h%o%' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE 'j%e%' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE 'j%k%' ORDER BY t; +SELECT * FROM testlike WHERE t LIKE 'h%k%' ORDER BY t; + + + diff --git a/uninstall_wildspeed.sql b/uninstall_wildspeed.sql new file mode 100644 index 0000000..978c142 --- /dev/null +++ b/uninstall_wildspeed.sql @@ -0,0 +1,10 @@ +BEGIN; + +DROP FUNCTION IF EXISTS permute(text) CASCADE; +DROP OPERATOR CLASS IF EXISTS wildcard_ops USING gin CASCADE; +DROP FUNCTION IF EXISTS gin_extract_permuted(text, internal) CASCADE; +DROP FUNCTION IF EXISTS wildcmp(text, text, bool) CASCADE; +DROP FUNCTION IF EXISTS gin_extract_wildcard(text, internal, int2, internal) CASCADE; +DROP FUNCTION IF EXISTS gin_consistent_wildcard(internal, int2, text) CASCADE; + +END; diff --git a/wildspeed.c b/wildspeed.c new file mode 100644 index 0000000..389dff8 --- /dev/null +++ b/wildspeed.c @@ -0,0 +1,651 @@ +#include "postgres.h" + +#include "catalog/pg_type.h" +#include "mb/pg_wchar.h" +#include "utils/array.h" + +/* + * MARK_SIGN is a sign of end of string and this character should + * be regular character. The best candidate of this is a zero byte + * which is accepted for any locale used by postgres. But it's impossible + * to show it, so we will replace it to another one (MARK_SIGN_SHOW) which + * can be noticed well. But we can't use it as mark because it's allowed + * to be inside string. + */ + +#define MARK_SIGN '\0' +#define MARK_SIGN_SHOW '$' + + +#define WC_BEGIN 0x01 /* should be in begining of string */ +#define WC_MIDDLE 0x02 /* should be in middle of string */ +#define WC_END 0x04 /* should be in end of string */ + +PG_MODULE_MAGIC; + +static text* +appendStrToText( text *src, char *str, int32 len, int32 maxlen ) +{ + int32 curlen; + + if (src == NULL ) + { + Assert( maxlen >= 0 ); + src = (text*)palloc( VARHDRSZ + sizeof(char*) * maxlen ); + SET_VARSIZE(src, 0 + VARHDRSZ); + } + + curlen = VARSIZE(src) - VARHDRSZ; + + if (len>0) + memcpy( VARDATA(src) + curlen, str, len ); + + SET_VARSIZE(src, curlen + len + VARHDRSZ); + + return src; +} + +static text* +appendMarkToText( text *src, int32 maxlen ) +{ + char sign = MARK_SIGN; + + return appendStrToText( src, &sign, 1, maxlen ); +} + +static text* +setFlagOfText( char flag, int32 maxlen ) +{ + char flagstruct[2]; + + Assert( maxlen > 0 ); + /* + * Mark text by setting first byte to MARK_SIGN to indicate + * that text has flags. It's a safe for non empty string, + * because first character can not be a MARK_SIGN (see + * gin_extract_permuted() ) + */ + + flagstruct[0] = MARK_SIGN; + flagstruct[1] = flag; + + return appendStrToText(NULL, flagstruct, 2, maxlen ); +} + +PG_FUNCTION_INFO_V1(gin_extract_permuted); +Datum gin_extract_permuted(PG_FUNCTION_ARGS); +Datum +gin_extract_permuted(PG_FUNCTION_ARGS) +{ + text *src = PG_GETARG_TEXT_P(0); + int32 *nentries = (int32 *) PG_GETARG_POINTER(1); + Datum *entries = NULL; + int32 srclen = pg_mbstrlen_with_len(VARDATA(src), VARSIZE(src) - VARHDRSZ); + + *nentries = srclen; + + if ( srclen == 0 ) + { + /* + * Empty string is encoded by alone MARK_SIGN character + */ + *nentries = 1; + entries = (Datum*) palloc(sizeof(Datum)); + entries[0] = PointerGetDatum( appendMarkToText( NULL, 1 ) ); + } + else + { + text *dst; + int32 i, + offset=0; /* offset to current position in src in bytes */ + int32 nbytes = VARSIZE(src) - VARHDRSZ; + char *srcptr = VARDATA(src); + + /* + * Permutation: hello will be permuted to hello$, ello$h, llo$he, lo$hel, o$hell. + * So, number of entries is equial to number of characters (not a bytes) + */ + + entries = (Datum*)palloc(sizeof(char*) * nbytes ); + for(i=0; i 2 && *ptra == MARK_SIGN ) + { + flag = *(ptra+1); + ptra+=2; + lena-=2; + + if ( lenb > 2 && *ptrb == MARK_SIGN ) + { + /* + * If they have different flags then they can not be equal, this + * place works only during check of equality of keys + * to search + */ + if ( flag != *(ptrb+1) ) + return 1; + ptrb+=2; + lenb-=2; + + /* b can not be a product of gin_extract_wildcard for partial match mode */ + Assert( partialMatch == false ); + } + } + else if ( lenb > 2 && *ptrb == MARK_SIGN ) + { + /* b can not be a product of gin_extract_wildcard for partial match mode */ + Assert( partialMatch == false ); + + ptrb+=2; + lenb-=2; + } + + if ( lena == 0 ) + { + if ( partialMatch ) + cmp = 0; /* full scan for partialMatch*/ + else + cmp = (lenb>0) ? -1 : 0; + } + else + { + /* + * We couldn't use strcmp because of MARK_SIGN + */ + cmp = memcmp(ptra, ptrb, Min(lena, lenb)); + + if ( partialMatch ) + { + if ( cmp == 0 ) + { + if ( lena > lenb ) + { + /* + * b argument is not beginning with argument a + */ + cmp = 1; + } + else if ( flag > 0 && lenb>lena /* be safe */ ) + { /* there is some flags to check */ + char actualFlag; + + if ( ptrb[ lenb - 1 ] == MARK_SIGN ) + actualFlag = WC_BEGIN; + else if ( ptrb[ lena ] == MARK_SIGN ) + actualFlag = WC_END; + else + actualFlag = WC_MIDDLE; + + if ( (flag & actualFlag) == 0 ) + { + /* + * Prefix are matched but this prefix s not placed as needed. + * so we should give a smoke signal to GIN that we don't want + * this match, but wish to continue scan + */ + cmp = -1; + } + } + } + else if (cmp < 0) + { + cmp = 1; /* prevent continue scan */ + } + } + else if ( (cmp == 0) && (lena != lenb) ) + { + cmp = (lena < lenb) ? -1 : 1; + } + } + + PG_FREE_IF_COPY(a,0); + PG_FREE_IF_COPY(b,1); + PG_RETURN_INT32( cmp ); +} + +#ifdef OPTIMIZE_WILDCARD_QUERY + +typedef struct +{ + Datum entry; + int32 len; + char flag; +} OptItem; + + +/* + * Function drops most short search word to speedup + * index search by preventing use word which gives + * a lot of matches + */ +static void +optimize_wildcard_search( Datum *entries, int32 *nentries ) +{ + int32 maxlen=0; + OptItem *items; + int i, nitems = *nentries; + char *ptr,*p; + + items = (OptItem*)palloc( sizeof(OptItem) * (*nentries) ); + for(i=0;i 2 && *ptr == MARK_SIGN ) + { + items[i].len-=2; + items[i].flag = *(ptr+1); + } + else + { + items[i].flag = 0; + if ( items[i].len > 1 && (p=strchr(ptr, MARK_SIGN)) != NULL ) + { + if ( p == ptr + items[i].len -1 ) + items[i].flag = WC_BEGIN; + else + items[i].flag = WC_BEGIN | WC_END; + } + } + + if ( items[i].len > maxlen ) + maxlen = items[i].len; + } + + *nentries=0; + + for(i=0;i maxlen ) + { + entries[ *nentries ] = items[i].entry; + (*nentries)++; + } + } + else if ( 2*items[i].len > maxlen ) + { + /* + * use only items with biggest length + */ + entries[ *nentries ] = items[i].entry; + (*nentries)++; + } + } + + Assert( *nentries>0 ); + +} +#endif + +typedef struct +{ + bool iswildcard; + int32 len; + char *ptr; +} WildItem; + +PG_FUNCTION_INFO_V1(gin_extract_wildcard); +Datum gin_extract_wildcard(PG_FUNCTION_ARGS); +Datum +gin_extract_wildcard(PG_FUNCTION_ARGS) +{ + text *q = PG_GETARG_TEXT_P(0); + int32 lenq = VARSIZE(q) - VARHDRSZ; + int32 *nentries = (int32 *) PG_GETARG_POINTER(1); +#ifdef NOT_USED + StrategyNumber strategy = PG_GETARG_UINT16(2); +#endif + bool *partialmatch, + **ptr_partialmatch = (bool**) PG_GETARG_POINTER(3); + Datum *entries = NULL; + char *qptr = VARDATA(q); + int clen, + splitqlen = 0, + i; + WildItem *items; + text *entry; + + *nentries = 0; + + if ( lenq == 0 ) + { + partialmatch = *ptr_partialmatch = (bool*)palloc0(sizeof(bool)); + *nentries = 1; + entries = (Datum*) palloc(sizeof(Datum)); + entries[0] = PointerGetDatum( appendMarkToText( NULL, 1 ) ); + + PG_RETURN_POINTER(entries); + } + + partialmatch = *ptr_partialmatch = (bool*)palloc0(sizeof(bool) * lenq); + entries = (Datum*) palloc(sizeof(Datum) * lenq); + items=(WildItem*) palloc0( sizeof(WildItem) * lenq ); + + + /* + * Parse expression to the list of constant parts and + * wildcards + */ + while( qptr - VARDATA(q) < lenq ) + { + clen = pg_mblen(qptr); + + if ( clen==1 && (*qptr == '_' || *qptr == '%' ) ) + { + if ( splitqlen == 0 ) + { + items[ splitqlen ].iswildcard = true; + splitqlen++; + } + else if ( items[ splitqlen-1 ].iswildcard == false ) + { + items[ splitqlen-1 ].len = qptr - items[ splitqlen-1 ].ptr; + items[ splitqlen ].iswildcard = true; + splitqlen++; + } + /* + * ignore wildcard, because we don't make difference beetween + * %, _ or a combination of its + */ + } + else + { + if ( splitqlen == 0 || items[ splitqlen-1 ].iswildcard == true ) + { + items[ splitqlen ].ptr = qptr; + splitqlen++; + } + } + qptr += clen; + } + + Assert( splitqlen >= 1 ); + if ( items[ splitqlen-1 ].iswildcard == false ) + items[ splitqlen-1 ].len = qptr - items[ splitqlen-1 ].ptr; + + if ( items[ 0 ].iswildcard == false ) + { + /* X... */ + if ( splitqlen == 1 ) + { + /* X => X$, exact match */ + *nentries = 1; + entry = appendStrToText(NULL, items[ 0 ].ptr, items[ 0 ].len, lenq+1); + entry = appendMarkToText( entry, -1 ); + entries[0] = PointerGetDatum( entry ); + } + else if ( items[ splitqlen-1 ].iswildcard == false ) + { + /* X * [X1 * [] ] ] Y => Y$X* [ + X1* [] ] */ + + *nentries = 1; + entry = appendStrToText(NULL, items[ splitqlen-1 ].ptr, items[ splitqlen-1 ].len, lenq+1); + entry = appendMarkToText( entry, -1 ); + entry = appendStrToText(entry, items[ 0 ].ptr, items[ 0 ].len, -1); + partialmatch[0] = true; + entries[0] = PointerGetDatum( entry ); + + for(i=1; i X*$ [ + X1* [] ] */ + + entry = setFlagOfText( WC_BEGIN, lenq + 1 /* MARK_SIGN */ + 2 /* flag */ ); + entry = appendStrToText(entry, items[ 0 ].ptr, items[ 0 ].len, -1); + *nentries = 1; + partialmatch[ 0 ] = true; + entries[0] = PointerGetDatum( entry ); + + for(i=2; i full scan */ + *nentries = 1; + entry = appendStrToText(NULL, "", 0, lenq+1); + partialmatch[0] = true; + entries[0] = PointerGetDatum( entry ); + } + else if ( items[ splitqlen-1 ].iswildcard == false ) + { + /* * [ X1 * [] ] X => X$* [ + X1* [] ] */ + *nentries = 1; + entry = appendStrToText(NULL, items[ splitqlen-1 ].ptr, items[ splitqlen-1 ].len, lenq+1); + entry = appendMarkToText( entry, -1 ); + partialmatch[0] = true; + entries[0] = PointerGetDatum( entry ); + + for(i=1; i X* [ + X1* [] ] */ + for(i=1; i 3 ) + { + if ( i==1 ) + entry = setFlagOfText( WC_MIDDLE | WC_BEGIN, lenq + 1 /* MARK_SIGN */ + 2 /* flag */ ); + else if ( i == splitqlen-2 ) + entry = setFlagOfText( WC_MIDDLE | WC_END, lenq + 1 /* MARK_SIGN */ + 2 /* flag */ ); + else + entry = setFlagOfText( WC_MIDDLE, lenq + 1 /* MARK_SIGN */ + 2 /* flag */ ); + } + else + entry = NULL; + entry = appendStrToText(entry, items[ i ].ptr, items[ i ].len, lenq+1); + partialmatch[ *nentries ] = true; + entries[ *nentries ] = PointerGetDatum( entry ); + (*nentries)++; + } + } + } + + PG_FREE_IF_COPY(q,0); + +#ifdef OPTIMIZE_WILDCARD_QUERY + if ( *nentries > 1 ) + optimize_wildcard_search( entries, nentries ); +#endif + + PG_RETURN_POINTER(entries); +} + + +PG_FUNCTION_INFO_V1(gin_consistent_wildcard); +Datum gin_consistent_wildcard(PG_FUNCTION_ARGS); +Datum +gin_consistent_wildcard(PG_FUNCTION_ARGS) +{ + bool *check = (bool *) PG_GETARG_POINTER(0); + bool res = true; + int i; + int32 nentries; + + if ( fcinfo->flinfo->fn_extra == NULL ) + { + bool *pmatch; + + /* + * we need to get nentries, we'll get it by regular way + * and store it in function context + */ + + fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, + sizeof(int32)); + + DirectFunctionCall4( + gin_extract_wildcard, + PG_GETARG_DATUM(2), /* query */ + PointerGetDatum( fcinfo->flinfo->fn_extra ), /* &nentries */ + PG_GETARG_DATUM(1), /* strategy */ + PointerGetDatum( &pmatch ) + ); + } + + nentries = *(int32*) fcinfo->flinfo->fn_extra; + + for (i = 0; res && i < nentries; i++) + if (check[i] == false) + res = false; + + PG_RETURN_BOOL(res); +} + +/* + * Mostly debug fuction + */ +PG_FUNCTION_INFO_V1(permute); +Datum permute(PG_FUNCTION_ARGS); +Datum +permute(PG_FUNCTION_ARGS) +{ + Datum src = PG_GETARG_DATUM(0); + int32 nentries = 0; + Datum *entries; + ArrayType *res; + int i; + + /* + * Get permuted values by gin_extract_permuted() + */ + entries = (Datum*) DatumGetPointer(DirectFunctionCall2( + gin_extract_permuted, src, PointerGetDatum(&nentries) + )); + + /* + * We need to replace MARK_SIGN to MARK_SIGN_SHOW. + * See comments above near definition of MARK_SIGN and MARK_SIGN_SHOW. + */ + if ( nentries == 1 && VARSIZE(entries[0]) == VARHDRSZ + 1) + { + *(VARDATA(entries[0])) = MARK_SIGN_SHOW; + } + else + { + int32 offset = 0; /* offset of MARK_SIGN */ + char *ptr; + + /* + * We scan array from the end because it allows simple calculation + * of MARK_SIGN position: on every iteration it's moved one + * character to the end. + */ + for(i=nentries-1;i>=0;i--) + { + ptr = VARDATA(entries[i]); + + offset += pg_mblen(ptr); + Assert( *(ptr + offset) == MARK_SIGN ); + *(ptr + offset) = MARK_SIGN_SHOW; + } + } + + res = construct_array( + entries, + nentries, + TEXTOID, + -1, + false, + 'i' + ); + + PG_RETURN_POINTER(res); +} diff --git a/wildspeed.sql.in b/wildspeed.sql.in new file mode 100644 index 0000000..a8c5cdb --- /dev/null +++ b/wildspeed.sql.in @@ -0,0 +1,41 @@ +BEGIN; + +-- support functions for gin +CREATE OR REPLACE FUNCTION gin_extract_permuted(text, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE; + +CREATE OR REPLACE FUNCTION wildcmp(text, text, bool) +RETURNS int32 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE; + +CREATE OR REPLACE FUNCTION gin_extract_wildcard(text, internal, int2, internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE; + +CREATE OR REPLACE FUNCTION gin_consistent_wildcard(internal, int2, text) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE; + +CREATE OPERATOR CLASS wildcard_ops +FOR TYPE text USING gin +AS + OPERATOR 1 ~~ RECHECK, + FUNCTION 1 wildcmp(text,text,bool), + FUNCTION 2 gin_extract_permuted(text, internal), + FUNCTION 3 gin_extract_wildcard(text, internal, int2, internal), + FUNCTION 4 gin_consistent_wildcard(internal, int2, text), +STORAGE text; + + +--debug function +CREATE OR REPLACE FUNCTION permute(text) +RETURNS _text +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +COMMIT;