From: Teodor Sigaev Date: Wed, 25 May 2016 10:24:05 +0000 (+0300) Subject: Make `wildspeed` a modern Postgres extension. X-Git-Url: http://sigaev.ru/git/gitweb.cgi?a=commitdiff_plain;h=refs%2Fheads%2Fmaster;p=wildspeed.git Make `wildspeed` a modern Postgres extension. 1. added control-file. 2. removed unrelated stuff. 3. `expected/wildspeed.out`: added trailing whitespaces produced by `psql` Petr Korobeinikov --- diff --git a/.gitignore b/.gitignore index 46eea3c..b4b5f16 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,11 @@ -.gitignore -*.swp *.o *.a +*.so *.d *.orig *.rej -temp -diffs -etc -results +*.gcda +*.gcno +/.deps/ +/diffs/ +/results/ diff --git a/Makefile b/Makefile index 78d2dc0..511806e 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,11 @@ PG_CPPFLAGS = -DOPTIMIZE_WILDCARD_QUERY MODULE_big = wildspeed -OBJS = wildspeed.o +OBJS = wildspeed.o + +EXTENSION = wildspeed +DATA = wildspeed--1.0.sql +PGFILEDESC = "Wildspeed - fast wildcard search for LIKE operator" -DATA_built = wildspeed.sql -DATA = uninstall_wildspeed.sql REGRESS = wildspeed ifdef USE_PGXS @@ -15,4 +17,4 @@ subdir = contrib/wildspeed top_builddir = ../.. include $(top_builddir)/src/Makefile.global include $(top_srcdir)/contrib/contrib-global.mk -endif \ No newline at end of file +endif diff --git a/README.md b/README.md new file mode 100644 index 0000000..bc404f0 --- /dev/null +++ b/README.md @@ -0,0 +1,159 @@ +# Wildspeed - fast wildcard search for LIKE operator + +Wildspeed extension provides GIN index support for wildcard search +for LIKE operator. + +Online version of this document is available +http://www.sai.msu.su/~megera/wiki/wildspeed + +## Authors + +* Oleg Bartunov , Moscow, Moscow University, Russia +* Teodor Sigaev , Moscow, Moscow University,Russia + +## License + +Stable version, included into PostgreSQL distribution, released under +BSD license. 
Development version, available from this site, released +under the GNU General Public License, version 2 (June 1991) + +## Downloads + +Stable version of wildspeed is available from +http://www.sigaev.ru/cvsweb/cvsweb.cgi/wildspeed/ + +## Installation + + % make USE_PGXS=1 + % make install + % make installcheck + % psql DB -c 'CREATE EXTENSION wildspeed' + +Wildspeed provides opclass (wildcard_ops) and uses partial match +feature of GIN, available since 8.4. Also, it supports full index scan. + +The size of index can be very big, since it contains entries for all +permutations of the original word, see the References section for details. For example, +word hello will be indexed as well as all its permutations: + + =# select permute('hello'); + permute + -------------------------------------- + {hello$,ello$h,llo$he,lo$hel,o$hell} + + Notice, symbol '$' is used only for visualization, in actual + implementation null-symbol '\0' is used. + + Search query is rewritten as prefix search: + *X -> X$* + X*Y -> Y$X* + *X* -> X* + +For example, search for 'hel*o' will be rewritten as 'o$hel*'. + +Special function `permute(TEXT)`, which returns all permutations of +argument, is provided for test purposes. + +Performance of wildspeed depends on search pattern. Basically, +wildspeed is less effective than btree index with text_pattern_ops for +prefix search (the difference is greatly reduced for long prefixes) and +much faster for wildcard search. + +Wildspeed by default uses optimization (skip short patterns if there +are long ones), which can be turned off in Makefile by removing define +`-DOPTIMIZE_WILDCARD_QUERY`. 
+ +## References + +* http://www.cs.wright.edu/~tkprasad/courses/cs499/L05TolerantIR.ppt +* http://nlp.stanford.edu/IR-book/html/htmledition/permuterm-indexes-1.html + +## Examples + +Table words contains 747358 records, w1 and w2 columns contains the +same data in order to test performance of Btree (w1) and GIN (w2) +indexes: + + Table "public.words" + Column | Type | Modifiers + --------+------+----------- + w1 | text | + w2 | text | + + words=# create index bt_idx on words using btree (w1 text_pattern_ops); + CREATE INDEX + Time: 1885.195 ms + words=# create index gin_idx on words using gin (w2 wildcard_ops); + vacuum analyze; + CREATE INDEX + Time: 530351.223 ms + +Size: + + words=# select pg_relation_size('words'); + pg_relation_size + ------------------ + 43253760 + + words=# select pg_relation_size('gin_idx'); + pg_relation_size + ------------------ + 417816576 + (1 row) + + words=# select pg_relation_size('bt_idx'); + pg_relation_size + ------------------ + 23437312 + (1 row) + + Prefix search: + words=# select count(*) from words where w1 like 'a%'; + count + ------- + 15491 + (1 row) + + Time: 7.502 ms + words=# select count(*) from words where w2 like 'a%'; + count + ------- + 15491 + (1 row) + + Time: 31.152 ms + +Wildcard search: + + words=# select count(*) from words where w1 like '%asd%'; + count + ------- + 26 + (1 row) + + Time: 147.308 ms + words=# select count(*) from words where w2 like '%asd%'; + count + ------- + 26 + (1 row) + + Time: 0.339 ms + +Full index scan: + + words=# set enable_seqscan to off; + words=# explain analyze select count(*) from words where w2 like '%'; + QUERY PLAN + + -------------------------------------------------------------------------------------------------------------- + ----------------------------- + Aggregate (cost=226274.98..226274.99 rows=1 width=0) (actual time=2218.709..2218.709 rows=1 loops=1) + -> Bitmap Heap Scan on words (cost=209785.73..224406.77 rows=747283 width=0) (actual time=1510.516..1913. 
+ 430 rows=747358 loops=1) + Filter: (w2 ~~ '%'::text) + -> Bitmap Index Scan on gin_idx (cost=0.00..209598.91 rows=747283 width=0) (actual time=1509.358..1 + 509.358 rows=747358 loops=1) + Index Cond: (w2 ~~ '%'::text) + Total runtime: 2218.747 ms + (6 rows) diff --git a/README.wildspeed b/README.wildspeed deleted file mode 100644 index 1bee021..0000000 --- a/README.wildspeed +++ /dev/null @@ -1,158 +0,0 @@ -WildSpeed - fast wildcard search for LIKE operator - - Wildspeed extension provides GIN index support for wildcard search - for LIKE operator. - - Online version of this document is available - http://www.sai.msu.su/~megera/wiki/wildspeed - -Authors - - * Oleg Bartunov , Moscow, Moscow University, Russia - * Teodor Sigaev , Moscow, Moscow University,Russia - -License - - Stable version, included into PostgreSQL distribution, released under - BSD license. Development version, available from this site, released - under the GNU General Public License, version 2 (June 1991) - -Downloads - - Stable version of wildspeed is available from - http://www.sigaev.ru/cvsweb/cvsweb.cgi/wildspeed/ - -Installation - -% cd PGSQLSRC/contrib -% tar xzvf wildspeed.tar.gz -% make -% make install -% make installcheck -% psql DB < wildspeed.sql - - Wildspeed provides opclass (wildcard_ops) and uses partial match - feature of GIN, available since 8.4. Also, it supports full index scan. - - The size of index can be very big, since it contains entries for all - permutations of the original word, see [1] for details. For example, - word hello will be indexed as well as its all permutations: -=# select permute('hello'); - permute --------------------------------------- - {hello$,ello$h,llo$he,lo$hel,o$hell} - - Notice, symbol '$' is used only for visualization, in actual - implementation null-symbol '\0' is used. - - Search query rewritten as prefix search: -*X -> X$* -X*Y -> Y$X* -*X* -> X* - - For example, search for 'hel*o' will be rewritten as 'o$hel'. 
- - Special function permute(TEXT), which returns all permutations of - argument, provided for test purposes. - - Performance of wildspeed depends on search pattern. Basically, - wildspeed is less effective than btree index with text_pattern_ops for - prefix search (the difference is greatly reduced for long prefixes) and - much faster for wildcard search. - - Wildspeed by default uses optimization (skip short patterns if there - are long one), which can be turned off in Makefile by removing define - -DOPTIMIZE_WILDCARD_QUERY. - -References - - 1. http://www.cs.wright.edu/~tkprasad/courses/cs499/L05TolerantIR.ppt, see also, - http://nlp.stanford.edu/IR-book/html/htmledition/permuterm-indexes-1.html - -Examples - - Table words contains 747358 records, w1 and w2 columns contains the - same data in order to test performance of Btree (w1) and GIN (w2) - indexes: - Table "public.words" - Column | Type | Modifiers ---------+------+----------- - w1 | text | - w2 | text | - -words=# create index bt_idx on words using btree (w1 text_pattern_ops); -CREATE INDEX -Time: 1885.195 ms -words=# create index gin_idx on words using gin (w2 wildcard_ops); -vacuum analyze; -CREATE INDEX -Time: 530351.223 ms - -Size: - -words=# select pg_relation_size('words'); - pg_relation_size ------------------- - 43253760 - -words=# select pg_relation_size('gin_idx'); - pg_relation_size ------------------- - 417816576 -(1 row) - -words=# select pg_relation_size('bt_idx'); - pg_relation_size ------------------- - 23437312 -(1 row) - - Prefix search: -words=# select count(*) from words where w1 like 'a%'; - count -------- - 15491 -(1 row) - -Time: 7.502 ms -words=# select count(*) from words where w2 like 'a%'; - count -------- - 15491 -(1 row) - -Time: 31.152 ms - - Wildcard search: -words=# select count(*) from words where w1 like '%asd%'; - count -------- - 26 -(1 row) - -Time: 147.308 ms -words=# select count(*) from words where w2 like '%asd%'; - count -------- - 26 -(1 row) - -Time: 0.339 ms - 
- Full index scan: -words=# set enable_seqscan to off; -words=# explain analyze select count(*) from words where w2 like '%'; - QUERY PLAN - --------------------------------------------------------------------------------------------------------------- ------------------------------ - Aggregate (cost=226274.98..226274.99 rows=1 width=0) (actual time=2218.709..2218.709 rows=1 loops=1) - -> Bitmap Heap Scan on words (cost=209785.73..224406.77 rows=747283 width=0) (actual time=1510.516..1913. -430 rows=747358 loops=1) - Filter: (w2 ~~ '%'::text) - -> Bitmap Index Scan on gin_idx (cost=0.00..209598.91 rows=747283 width=0) (actual time=1509.358..1 -509.358 rows=747358 loops=1) - Index Cond: (w2 ~~ '%'::text) - Total runtime: 2218.747 ms -(6 rows) - diff --git a/expected/wildspeed.out b/expected/wildspeed.out index b5a85fc..82a0ea9 100644 --- a/expected/wildspeed.out +++ b/expected/wildspeed.out @@ -1,6 +1,6 @@ -- -- first, define the datatype. Turn off echoing so that expected file --- does not depend on contents of wildspeed.sql. +-- does not depend on `CREATE EXTENSION wildspeed` output. -- SET client_min_messages = warning; \set ECHO none diff --git a/sql/wildspeed.sql b/sql/wildspeed.sql index d536e69..6647246 100644 --- a/sql/wildspeed.sql +++ b/sql/wildspeed.sql @@ -1,11 +1,11 @@ -- -- first, define the datatype. Turn off echoing so that expected file --- does not depend on contents of wildspeed.sql. +-- does not depend on `CREATE EXTENSION wildspeed` output. 
-- SET client_min_messages = warning; \set ECHO none -\i wildspeed.sql +CREATE EXTENSION wildspeed; \set ECHO all RESET client_min_messages; diff --git a/uninstall_wildspeed.sql b/uninstall_wildspeed.sql deleted file mode 100644 index 978c142..0000000 --- a/uninstall_wildspeed.sql +++ /dev/null @@ -1,10 +0,0 @@ -BEGIN; - -DROP FUNCTION IF EXISTS permute(text) CASCADE; -DROP OPERATOR CLASS IF EXISTS wildcard_ops USING gin CASCADE; -DROP FUNCTION IF EXISTS gin_extract_permuted(text, internal) CASCADE; -DROP FUNCTION IF EXISTS wildcmp(text, text, bool) CASCADE; -DROP FUNCTION IF EXISTS gin_extract_wildcard(text, internal, int2, internal) CASCADE; -DROP FUNCTION IF EXISTS gin_consistent_wildcard(internal, int2, text) CASCADE; - -END; diff --git a/wildspeed.sql.in b/wildspeed--1.0.sql similarity index 87% rename from wildspeed.sql.in rename to wildspeed--1.0.sql index 4cbaa5a..39f7b43 100644 --- a/wildspeed.sql.in +++ b/wildspeed--1.0.sql @@ -1,4 +1,5 @@ -BEGIN; +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION wildspeed" to load this file. \quit -- support functions for gin CREATE OR REPLACE FUNCTION gin_extract_permuted(text, internal) @@ -29,7 +30,7 @@ LANGUAGE C IMMUTABLE; CREATE OPERATOR CLASS wildcard_ops FOR TYPE text USING gin AS - OPERATOR 1 ~~, + OPERATOR 1 ~~, FUNCTION 1 wildcmp(text,text), FUNCTION 2 gin_extract_permuted(text, internal), FUNCTION 3 gin_extract_wildcard(text, internal, int2, internal), @@ -43,5 +44,3 @@ CREATE OR REPLACE FUNCTION permute(text) RETURNS _text AS 'MODULE_PATHNAME' LANGUAGE C STRICT IMMUTABLE; - -COMMIT; diff --git a/wildspeed.control b/wildspeed.control new file mode 100644 index 0000000..5ed1d45 --- /dev/null +++ b/wildspeed.control @@ -0,0 +1,5 @@ +# wildspeed extension +comment = 'Wildspeed - fast wildcard search for LIKE operator' +default_version = '1.0' +module_pathname = '$libdir/wildspeed' +relocatable = true