Skip to content

Commit cbcc872

Browse files
committed
Update snowball
Update to snowball tag v2.0.0. Major changes are new stemmers for Basque, Catalan, and Hindi. Discussion: https://www.postgresql.org/message-id/flat/a8eeabd6-2be1-43fe-401e-a97594c38478%402ndquadrant.com
1 parent 57cb806 commit cbcc872

File tree

97 files changed

+6914
-2166
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

97 files changed

+6914
-2166
lines changed

src/backend/snowball/Makefile

+8
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ OBJS = \
2323
utilities.o
2424

2525
OBJS += \
26+
stem_ISO_8859_1_basque.o \
27+
stem_ISO_8859_1_catalan.o \
2628
stem_ISO_8859_1_danish.o \
2729
stem_ISO_8859_1_dutch.o \
2830
stem_ISO_8859_1_english.o \
@@ -41,13 +43,16 @@ OBJS += \
4143
stem_ISO_8859_2_romanian.o \
4244
stem_KOI8_R_russian.o \
4345
stem_UTF_8_arabic.o \
46+
stem_UTF_8_basque.o \
47+
stem_UTF_8_catalan.o \
4448
stem_UTF_8_danish.o \
4549
stem_UTF_8_dutch.o \
4650
stem_UTF_8_english.o \
4751
stem_UTF_8_finnish.o \
4852
stem_UTF_8_french.o \
4953
stem_UTF_8_german.o \
5054
stem_UTF_8_greek.o \
55+
stem_UTF_8_hindi.o \
5156
stem_UTF_8_hungarian.o \
5257
stem_UTF_8_indonesian.o \
5358
stem_UTF_8_irish.o \
@@ -70,13 +75,16 @@ OBJS += \
7075
# must come after creation of that language
7176
LANGUAGES= \
7277
arabic arabic \
78+
basque basque \
79+
catalan catalan \
7380
danish danish \
7481
dutch dutch \
7582
english english \
7683
finnish finnish \
7784
french french \
7885
german german \
7986
greek greek \
87+
hindi english \
8088
hungarian hungarian \
8189
indonesian indonesian \
8290
irish irish \

src/backend/snowball/README

+6-3
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ This module uses the word stemming code developed by the Snowball project,
77
http://snowballstem.org (formerly http://snowball.tartarus.org)
88
which is released by them under a BSD-style license.
99

10-
The Snowball project is not currently making formal releases; it's best
10+
The Snowball project does not often make formal releases; it's best
1111
to pull from their git repository
1212

1313
git clone https://github.com/snowballstem/snowball.git
@@ -29,8 +29,8 @@ We choose to include the derived files in the PostgreSQL distribution
2929
because most installations will not have the Snowball compiler available.
3030

3131
We are currently synced with the Snowball git commit
32-
4456b82c26c02493e8807a66f30593a98c5d2888
33-
of 2019-06-24.
32+
c70ed64f9d41c1032fba4e962b054f8e9d489a74 (tag v2.0.0)
33+
of 2019-10-02.
3434

3535
To update the PostgreSQL sources from a new Snowball version:
3636

@@ -44,6 +44,9 @@ do
4444
sed 's|\.\./runtime/header\.h|header.h|' $f >libstemmer/`basename $f`
4545
done
4646

47+
Do not copy stemmers that are listed in libstemmer/modules.txt as
48+
nonstandard, such as "german2" or "lovins".
49+
4750
2. Copy the *.c files in snowball/runtime/ to
4851
src/backend/snowball/libstemmer, and edit them to remove direct inclusions
4952
of system headers such as <stdio.h> --- they should only include "header.h".

src/backend/snowball/dict_snowball.c

+10
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626

2727
/* Now we can include the original Snowball header.h */
2828
#include "snowball/libstemmer/header.h"
29+
#include "snowball/libstemmer/stem_ISO_8859_1_basque.h"
30+
#include "snowball/libstemmer/stem_ISO_8859_1_catalan.h"
2931
#include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
3032
#include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
3133
#include "snowball/libstemmer/stem_ISO_8859_1_english.h"
@@ -44,13 +46,16 @@
4446
#include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
4547
#include "snowball/libstemmer/stem_KOI8_R_russian.h"
4648
#include "snowball/libstemmer/stem_UTF_8_arabic.h"
49+
#include "snowball/libstemmer/stem_UTF_8_basque.h"
50+
#include "snowball/libstemmer/stem_UTF_8_catalan.h"
4751
#include "snowball/libstemmer/stem_UTF_8_danish.h"
4852
#include "snowball/libstemmer/stem_UTF_8_dutch.h"
4953
#include "snowball/libstemmer/stem_UTF_8_english.h"
5054
#include "snowball/libstemmer/stem_UTF_8_finnish.h"
5155
#include "snowball/libstemmer/stem_UTF_8_french.h"
5256
#include "snowball/libstemmer/stem_UTF_8_german.h"
5357
#include "snowball/libstemmer/stem_UTF_8_greek.h"
58+
#include "snowball/libstemmer/stem_UTF_8_hindi.h"
5459
#include "snowball/libstemmer/stem_UTF_8_hungarian.h"
5560
#include "snowball/libstemmer/stem_UTF_8_indonesian.h"
5661
#include "snowball/libstemmer/stem_UTF_8_irish.h"
@@ -92,6 +97,8 @@ static const stemmer_module stemmer_modules[] =
9297
/*
9398
* Stemmers list from Snowball distribution
9499
*/
100+
STEMMER_MODULE(basque, PG_LATIN1, ISO_8859_1),
101+
STEMMER_MODULE(catalan, PG_LATIN1, ISO_8859_1),
95102
STEMMER_MODULE(danish, PG_LATIN1, ISO_8859_1),
96103
STEMMER_MODULE(dutch, PG_LATIN1, ISO_8859_1),
97104
STEMMER_MODULE(english, PG_LATIN1, ISO_8859_1),
@@ -110,13 +117,16 @@ static const stemmer_module stemmer_modules[] =
110117
STEMMER_MODULE(romanian, PG_LATIN2, ISO_8859_2),
111118
STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
112119
STEMMER_MODULE(arabic, PG_UTF8, UTF_8),
120+
STEMMER_MODULE(basque, PG_UTF8, UTF_8),
121+
STEMMER_MODULE(catalan, PG_UTF8, UTF_8),
113122
STEMMER_MODULE(danish, PG_UTF8, UTF_8),
114123
STEMMER_MODULE(dutch, PG_UTF8, UTF_8),
115124
STEMMER_MODULE(english, PG_UTF8, UTF_8),
116125
STEMMER_MODULE(finnish, PG_UTF8, UTF_8),
117126
STEMMER_MODULE(french, PG_UTF8, UTF_8),
118127
STEMMER_MODULE(german, PG_UTF8, UTF_8),
119128
STEMMER_MODULE(greek, PG_UTF8, UTF_8),
129+
STEMMER_MODULE(hindi, PG_UTF8, UTF_8),
120130
STEMMER_MODULE(hungarian, PG_UTF8, UTF_8),
121131
STEMMER_MODULE(indonesian, PG_UTF8, UTF_8),
122132
STEMMER_MODULE(irish, PG_UTF8, UTF_8),

src/backend/snowball/libstemmer/api.c

-1
Original file line numberDiff line numberDiff line change
@@ -61,4 +61,3 @@ extern int SN_set_current(struct SN_env * z, int size, const symbol * s)
6161
z->c = 0;
6262
return err;
6363
}
64-

0 commit comments

Comments
 (0)