summaryrefslogtreecommitdiff
path: root/contrib/pg_trgm
diff options
context:
space:
mode:
authorTom Lane2010-12-04 05:16:21 +0000
committerTom Lane2010-12-04 05:16:21 +0000
commitb525bf771e31a2254f28bf25c6ed7987d64c8afb (patch)
tree65583461edd171150f868f7f769635f43976d807 /contrib/pg_trgm
parentb576757d7ee064ada5351c2e6a36c2f7234aa1d4 (diff)
Add KNNGIST support to contrib/pg_trgm.
Teodor Sigaev, with some revision by Tom
Diffstat (limited to 'contrib/pg_trgm')
-rw-r--r--contrib/pg_trgm/expected/pg_trgm.out23
-rw-r--r--contrib/pg_trgm/pg_trgm.sql.in21
-rw-r--r--contrib/pg_trgm/sql/pg_trgm.sql4
-rw-r--r--contrib/pg_trgm/trgm.h10
-rw-r--r--contrib/pg_trgm/trgm_gin.c3
-rw-r--r--contrib/pg_trgm/trgm_gist.c140
-rw-r--r--contrib/pg_trgm/trgm_op.c26
-rw-r--r--contrib/pg_trgm/uninstall_pg_trgm.sql6
8 files changed, 192 insertions, 41 deletions
diff --git a/contrib/pg_trgm/expected/pg_trgm.out b/contrib/pg_trgm/expected/pg_trgm.out
index 98385347295..0532a78e4ae 100644
--- a/contrib/pg_trgm/expected/pg_trgm.out
+++ b/contrib/pg_trgm/expected/pg_trgm.out
@@ -1187,6 +1187,13 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
qwertyu0988 | 0.333333
(1 row)
+select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
+ ?column? | t
+----------+-------------
+ 0.411765 | qwertyu0988
+ 0.5 | qwertyu0987
+(2 rows)
+
create index trgm_idx on test_trgm using gist (t gist_trgm_ops);
set enable_seqscan=off;
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
@@ -2315,6 +2322,22 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
qwertyu0988 | 0.333333
(1 row)
+explain (costs off)
+select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
+ QUERY PLAN
+---------------------------------------------------
+ Limit
+ -> Index Scan using trgm_idx on test_trgm
+ Order By: (t <-> 'q0987wertyu0988'::text)
+(3 rows)
+
+select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
+ ?column? | t
+----------+-------------
+ 0.411765 | qwertyu0988
+ 0.5 | qwertyu0987
+(2 rows)
+
drop index trgm_idx;
create index trgm_idx on test_trgm using gin (t gin_trgm_ops);
set enable_seqscan=off;
diff --git a/contrib/pg_trgm/pg_trgm.sql.in b/contrib/pg_trgm/pg_trgm.sql.in
index cce6cd9872f..3e116e8306f 100644
--- a/contrib/pg_trgm/pg_trgm.sql.in
+++ b/contrib/pg_trgm/pg_trgm.sql.in
@@ -26,7 +26,7 @@ LANGUAGE C STRICT IMMUTABLE;
CREATE OR REPLACE FUNCTION similarity_op(text,text)
RETURNS bool
AS 'MODULE_PATHNAME'
-LANGUAGE C STRICT STABLE;
+LANGUAGE C STRICT STABLE; -- stable because depends on trgm_limit
CREATE OPERATOR % (
LEFTARG = text,
@@ -37,6 +37,18 @@ CREATE OPERATOR % (
JOIN = contjoinsel
);
+CREATE OR REPLACE FUNCTION similarity_dist(text,text)
+RETURNS float4
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT IMMUTABLE;
+
+CREATE OPERATOR <-> (
+ LEFTARG = text,
+ RIGHTARG = text,
+ PROCEDURE = similarity_dist,
+ COMMUTATOR = '<->'
+);
+
-- gist key
CREATE OR REPLACE FUNCTION gtrgm_in(cstring)
RETURNS gtrgm
@@ -60,6 +72,11 @@ RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
+CREATE OR REPLACE FUNCTION gtrgm_distance(internal,text,int,oid)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C IMMUTABLE STRICT;
+
CREATE OR REPLACE FUNCTION gtrgm_compress(internal)
RETURNS internal
AS 'MODULE_PATHNAME'
@@ -95,6 +112,7 @@ CREATE OPERATOR CLASS gist_trgm_ops
FOR TYPE text USING gist
AS
OPERATOR 1 % (text, text),
+ OPERATOR 2 <-> (text, text) FOR ORDER BY pg_catalog.float_ops,
FUNCTION 1 gtrgm_consistent (internal, text, int, oid, internal),
FUNCTION 2 gtrgm_union (bytea, internal),
FUNCTION 3 gtrgm_compress (internal),
@@ -102,6 +120,7 @@ AS
FUNCTION 5 gtrgm_penalty (internal, internal, internal),
FUNCTION 6 gtrgm_picksplit (internal, internal),
FUNCTION 7 gtrgm_same (gtrgm, gtrgm, internal),
+ FUNCTION 8 gtrgm_distance (internal, text, int, oid),
STORAGE gtrgm;
-- support functions for gin
diff --git a/contrib/pg_trgm/sql/pg_trgm.sql b/contrib/pg_trgm/sql/pg_trgm.sql
index 20d86b8a0d2..5e5539c0057 100644
--- a/contrib/pg_trgm/sql/pg_trgm.sql
+++ b/contrib/pg_trgm/sql/pg_trgm.sql
@@ -26,6 +26,7 @@ CREATE TABLE test_trgm(t text);
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
+select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
create index trgm_idx on test_trgm using gist (t gist_trgm_ops);
set enable_seqscan=off;
@@ -33,6 +34,9 @@ set enable_seqscan=off;
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
+explain (costs off)
+select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
+select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
drop index trgm_idx;
create index trgm_idx on test_trgm using gin (t gin_trgm_ops);
diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h
index 85826733f55..1cc812554f7 100644
--- a/contrib/pg_trgm/trgm.h
+++ b/contrib/pg_trgm/trgm.h
@@ -4,12 +4,10 @@
#ifndef __TRGM_H__
#define __TRGM_H__
-#include "postgres.h"
-
#include "access/gist.h"
#include "access/itup.h"
-#include "utils/builtins.h"
#include "storage/bufpage.h"
+#include "utils/builtins.h"
/* options */
#define LPADDING 2
@@ -18,6 +16,10 @@
#define IGNORECASE
#define DIVUNION
+/* operator strategy numbers */
+#define SimilarityStrategyNumber 1
+#define DistanceStrategyNumber 2
+
typedef char trgm[3];
@@ -89,4 +91,4 @@ extern float4 trgm_limit;
TRGM *generate_trgm(char *str, int slen);
float4 cnt_sml(TRGM *trg1, TRGM *trg2);
-#endif
+#endif /* __TRGM_H__ */
diff --git a/contrib/pg_trgm/trgm_gin.c b/contrib/pg_trgm/trgm_gin.c
index 3ce0b2deb55..a5a94ca6755 100644
--- a/contrib/pg_trgm/trgm_gin.c
+++ b/contrib/pg_trgm/trgm_gin.c
@@ -1,6 +1,8 @@
/*
* contrib/pg_trgm/trgm_gin.c
*/
+#include "postgres.h"
+
#include "trgm.h"
#include "access/gin.h"
@@ -10,6 +12,7 @@
#include "utils/array.h"
#include "utils/builtins.h"
+
PG_FUNCTION_INFO_V1(gin_extract_trgm);
Datum gin_extract_trgm(PG_FUNCTION_ARGS);
diff --git a/contrib/pg_trgm/trgm_gist.c b/contrib/pg_trgm/trgm_gist.c
index 567b2f878ff..d9f3d40c179 100644
--- a/contrib/pg_trgm/trgm_gist.c
+++ b/contrib/pg_trgm/trgm_gist.c
@@ -1,15 +1,19 @@
/*
* contrib/pg_trgm/trgm_gist.c
*/
+#include "postgres.h"
+
#include "trgm.h"
#include "access/gist.h"
#include "access/itup.h"
+#include "access/skey.h"
#include "access/tuptoaster.h"
#include "storage/bufpage.h"
#include "utils/array.h"
#include "utils/builtins.h"
+
PG_FUNCTION_INFO_V1(gtrgm_in);
Datum gtrgm_in(PG_FUNCTION_ARGS);
@@ -25,6 +29,9 @@ Datum gtrgm_decompress(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(gtrgm_consistent);
Datum gtrgm_consistent(PG_FUNCTION_ARGS);
+PG_FUNCTION_INFO_V1(gtrgm_distance);
+Datum gtrgm_distance(PG_FUNCTION_ARGS);
+
PG_FUNCTION_INFO_V1(gtrgm_union);
Datum gtrgm_union(PG_FUNCTION_ARGS);
@@ -159,18 +166,35 @@ gtrgm_decompress(PG_FUNCTION_ARGS)
}
}
+static int4
+cnt_sml_sign_common(TRGM *qtrg, BITVECP sign)
+{
+ int4 count = 0;
+ int4 k,
+ len = ARRNELEM(qtrg);
+ trgm *ptr = GETARR(qtrg);
+ int4 tmp = 0;
+
+ for (k = 0; k < len; k++)
+ {
+ CPTRGM(((char *) &tmp), ptr + k);
+ count += GETBIT(sign, HASHVAL(tmp));
+ }
+
+ return count;
+}
+
Datum
gtrgm_consistent(PG_FUNCTION_ARGS)
{
GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
text *query = PG_GETARG_TEXT_P(1);
-
- /* StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); */
+ StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
/* Oid subtype = PG_GETARG_OID(3); */
bool *recheck = (bool *) PG_GETARG_POINTER(4);
TRGM *key = (TRGM *) DatumGetPointer(entry->key);
TRGM *qtrg;
- bool res = false;
+ bool res;
char *cache = (char *) fcinfo->flinfo->fn_extra;
/* All cases served by this function are exact */
@@ -193,39 +217,95 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
qtrg = (TRGM *) (cache + MAXALIGN(VARSIZE(query)));
- if (GIST_LEAF(entry))
- { /* all leafs contains orig trgm */
- float4 tmpsml = cnt_sml(key, qtrg);
+ switch (strategy)
+ {
+ case SimilarityStrategyNumber:
+ if (GIST_LEAF(entry))
+ { /* all leafs contains orig trgm */
+ float4 tmpsml = cnt_sml(key, qtrg);
- /* strange bug at freebsd 5.2.1 and gcc 3.3.3 */
- res = (*(int *) &tmpsml == *(int *) &trgm_limit || tmpsml > trgm_limit) ? true : false;
+ /* strange bug at freebsd 5.2.1 and gcc 3.3.3 */
+ res = (*(int *) &tmpsml == *(int *) &trgm_limit || tmpsml > trgm_limit) ? true : false;
+ }
+ else if (ISALLTRUE(key))
+ { /* non-leaf contains signature */
+ res = true;
+ }
+ else
+ { /* non-leaf contains signature */
+ int4 count = cnt_sml_sign_common(qtrg, GETSIGN(key));
+ int4 len = ARRNELEM(qtrg);
+
+ if (len == 0)
+ res = false;
+ else
+ res = (((((float8) count) / ((float8) len))) >= trgm_limit) ? true : false;
+ }
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d", strategy);
+ res = false; /* keep compiler quiet */
+ break;
}
- else if (ISALLTRUE(key))
- { /* non-leaf contains signature */
- res = true;
+
+ PG_RETURN_BOOL(res);
+}
+
+Datum
+gtrgm_distance(PG_FUNCTION_ARGS)
+{
+ GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
+ text *query = PG_GETARG_TEXT_P(1);
+ StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
+ /* Oid subtype = PG_GETARG_OID(3); */
+ TRGM *key = (TRGM *) DatumGetPointer(entry->key);
+ TRGM *qtrg;
+ float8 res;
+ char *cache = (char *) fcinfo->flinfo->fn_extra;
+
+ if (cache == NULL || VARSIZE(cache) != VARSIZE(query) || memcmp(cache, query, VARSIZE(query)) != 0)
+ {
+ qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
+
+ if (cache)
+ pfree(cache);
+
+ fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
+ MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg));
+ cache = (char *) fcinfo->flinfo->fn_extra;
+
+ memcpy(cache, query, VARSIZE(query));
+ memcpy(cache + MAXALIGN(VARSIZE(query)), qtrg, VARSIZE(qtrg));
}
- else
- { /* non-leaf contains signature */
- int4 count = 0;
- int4 k,
- len = ARRNELEM(qtrg);
- trgm *ptr = GETARR(qtrg);
- BITVECP sign = GETSIGN(key);
- int4 tmp = 0;
- for (k = 0; k < len; k++)
- {
- CPTRGM(((char *) &tmp), ptr + k);
- count += GETBIT(sign, HASHVAL(tmp));
- }
-#ifdef DIVUNION
- res = (len == count) ? true : ((((((float4) count) / ((float4) (len - count)))) >= trgm_limit) ? true : false);
-#else
- res = (len == 0) ? false : ((((((float4) count) / ((float4) len))) >= trgm_limit) ? true : false);
-#endif
+ qtrg = (TRGM *) (cache + MAXALIGN(VARSIZE(query)));
+
+ switch (strategy)
+ {
+ case DistanceStrategyNumber:
+ if (GIST_LEAF(entry))
+ { /* all leafs contains orig trgm */
+ res = 1.0 - cnt_sml(key, qtrg);
+ }
+ else if (ISALLTRUE(key))
+ { /* all leafs contains orig trgm */
+ res = 0.0;
+ }
+ else
+ { /* non-leaf contains signature */
+ int4 count = cnt_sml_sign_common(qtrg, GETSIGN(key));
+ int4 len = ARRNELEM(qtrg);
+
+ res = (len == 0) ? -1.0 : 1.0 - ((float8) count) / ((float8) len);
+ }
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d", strategy);
+ res = 0; /* keep compiler quiet */
+ break;
}
- PG_RETURN_BOOL(res);
+ PG_RETURN_FLOAT8(res);
}
static int4
diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c
index e15c826e189..b97e9912bad 100644
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -1,11 +1,16 @@
/*
* contrib/pg_trgm/trgm_op.c
*/
-#include "trgm.h"
+#include "postgres.h"
+
#include <ctype.h>
-#include "utils/array.h"
+
+#include "trgm.h"
+
#include "catalog/pg_type.h"
#include "tsearch/ts_locale.h"
+#include "utils/array.h"
+
PG_MODULE_MAGIC;
@@ -359,16 +364,25 @@ similarity(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT4(res);
}
+PG_FUNCTION_INFO_V1(similarity_dist);
+Datum similarity_dist(PG_FUNCTION_ARGS);
+Datum
+similarity_dist(PG_FUNCTION_ARGS)
+{
+ float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
+ PG_GETARG_DATUM(0),
+ PG_GETARG_DATUM(1)));
+ PG_RETURN_FLOAT4(1.0 - res);
+}
+
PG_FUNCTION_INFO_V1(similarity_op);
Datum similarity_op(PG_FUNCTION_ARGS);
Datum
similarity_op(PG_FUNCTION_ARGS)
{
- float4 res = DatumGetFloat4(DirectFunctionCall2(
- similarity,
+ float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
PG_GETARG_DATUM(0),
- PG_GETARG_DATUM(1)
- ));
+ PG_GETARG_DATUM(1)));
PG_RETURN_BOOL(res >= trgm_limit);
}
diff --git a/contrib/pg_trgm/uninstall_pg_trgm.sql b/contrib/pg_trgm/uninstall_pg_trgm.sql
index 6706dd133e1..bc8f1fa983d 100644
--- a/contrib/pg_trgm/uninstall_pg_trgm.sql
+++ b/contrib/pg_trgm/uninstall_pg_trgm.sql
@@ -19,6 +19,8 @@ DROP FUNCTION gtrgm_compress(internal);
DROP FUNCTION gtrgm_consistent(internal,text,int,oid,internal);
+DROP FUNCTION gtrgm_distance(internal,text,int,oid);
+
DROP TYPE gtrgm CASCADE;
DROP OPERATOR CLASS gin_trgm_ops USING gin;
@@ -33,6 +35,10 @@ DROP OPERATOR % (text, text);
DROP FUNCTION similarity_op(text,text);
+DROP OPERATOR <-> (text, text);
+
+DROP FUNCTION similarity_dist(text,text);
+
DROP FUNCTION similarity(text,text);
DROP FUNCTION show_trgm(text);