mirror of
https://github.com/lxsang/silk.git
synced 2024-11-14 01:08:22 +01:00
704 lines
13 KiB
C
704 lines
13 KiB
C
/* This is the Porter stemming algorithm, coded up in ANSI C by the
|
|
* author. It may be be regarded as canonical, in that it follows the
|
|
* algorithm presented in
|
|
*
|
|
* Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
|
|
* no. 3, pp 130-137,
|
|
*
|
|
* only differing from it at the points marked --DEPARTURE-- below.
|
|
*
|
|
* See also http://www.tartarus.org/~martin/PorterStemmer
|
|
*
|
|
* The algorithm as described in the paper could be exactly replicated
|
|
* by adjusting the points of DEPARTURE, but this is barely necessary,
|
|
* because (a) the points of DEPARTURE are definitely improvements, and
|
|
* (b) no encoding of the Porter stemmer I have seen is anything like
|
|
* as exact as this version, even with the points of DEPARTURE!
|
|
*
|
|
* You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which
|
|
* 'stem' takes a list of inputs and sends the stemmed equivalent to
|
|
* stdout.
|
|
*
|
|
* The algorithm as encoded here is particularly fast.
|
|
*
|
|
* Release 1: was many years ago
|
|
* Release 2: 11 Apr 2013
|
|
* fixes a bug noted by Matt Patenaude <matt@mattpatenaude.com>,
|
|
*
|
|
* case 'o': if (ends("\03" "ion") && (b[j] == 's' || b[j] == 't')) break;
|
|
* ==>
|
|
* case 'o': if (ends("\03" "ion") && j >= k0 && (b[j] == 's' || b[j] == 't')) break;
|
|
*
|
|
* to avoid accessing b[k0-1] when the word in b is "ion".
|
|
* Release 3: 25 Mar 2014
|
|
* fixes a similar bug noted by Klemens Baum <klemensbaum@gmail.com>,
|
|
* that if step1ab leaves a one letter result (ied -> i, aing -> a etc),
|
|
* step2 and step4 access the byte before the first letter. So we skip
|
|
* steps after step1ab unless k > k0. */
|
|
#include <string.h>
|
|
#include "lua/lualib.h"
|
|
#define TRUE 1
|
|
#define FALSE 0
|
|
|
|
/* The main part of the stemming algorithm starts here. b is a buffer
|
|
* holding a word to be stemmed. The letters are in b[k0], b[k0+1] ...
|
|
* ending at b[k]. In fact k0 = 0 in this demo program. k is readjusted
|
|
* downwards as the stemming progresses. Zero termination is not in fact
|
|
* used in the algorithm.
|
|
*
|
|
* Note that only lower case sequences are stemmed. Forcing to lower case
|
|
* should be done before stem(...) is called. */
|
|
|
|
/* buffer for word to be stemmed */
|
|
static char *b;
|
|
|
|
static int k;
|
|
static int k0;
|
|
|
|
/* j is a general offset into the string */
|
|
static int j;
|
|
|
|
/**
|
|
* TRUE when `b[i]` is a consonant.
|
|
*/
|
|
|
|
static int
|
|
isConsonant(int index) {
|
|
switch (b[index]) {
|
|
case 'a':
|
|
case 'e':
|
|
case 'i':
|
|
case 'o':
|
|
case 'u':
|
|
return FALSE;
|
|
case 'y':
|
|
return (index == k0) ? TRUE : !isConsonant(index - 1);
|
|
default:
|
|
return TRUE;
|
|
}
|
|
}
|
|
|
|
/* Measure the number of consonant sequences between
|
|
* `k0` and `j`. If C is a consonant sequence and V
|
|
* a vowel sequence, and <..> indicates arbitrary
|
|
* presence:
|
|
*
|
|
* <C><V> gives 0
|
|
* <C>VC<V> gives 1
|
|
* <C>VCVC<V> gives 2
|
|
* <C>VCVCVC<V> gives 3
|
|
* ....
|
|
*/
|
|
static int
|
|
getMeasure() {
|
|
int position;
|
|
int index;
|
|
|
|
position = 0;
|
|
index = k0;
|
|
|
|
while (TRUE) {
|
|
if (index > j) {
|
|
return position;
|
|
}
|
|
|
|
if (!isConsonant(index)) {
|
|
break;
|
|
}
|
|
|
|
index++;
|
|
}
|
|
|
|
index++;
|
|
|
|
while (TRUE) {
|
|
while (TRUE) {
|
|
if (index > j) {
|
|
return position;
|
|
}
|
|
|
|
if (isConsonant(index)) {
|
|
break;
|
|
}
|
|
|
|
index++;
|
|
}
|
|
|
|
index++;
|
|
position++;
|
|
|
|
while (TRUE) {
|
|
if (index > j) {
|
|
return position;
|
|
}
|
|
|
|
if (!isConsonant(index)) {
|
|
break;
|
|
}
|
|
|
|
index++;
|
|
}
|
|
|
|
index++;
|
|
}
|
|
}
|
|
|
|
/* `TRUE` when `k0, ... j` contains a vowel. */
|
|
static int
|
|
vowelInStem() {
|
|
int index;
|
|
|
|
index = k0 - 1;
|
|
|
|
while (++index <= j) {
|
|
if (!isConsonant(index)) {
|
|
return TRUE;
|
|
}
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
/* `TRUE` when `j` and `(j-1)` are the same consonant. */
|
|
static int
|
|
isDoubleConsonant(int index) {
|
|
if (b[index] != b[index - 1]) {
|
|
return FALSE;
|
|
}
|
|
|
|
return isConsonant(index);
|
|
}
|
|
|
|
/* `TRUE` when `i - 2, i - 1, i` has the form
|
|
* `consonant - vowel - consonant` and also if the second
|
|
* C is not `"w"`, `"x"`, or `"y"`. this is used when
|
|
* trying to restore an `e` at the end of a short word.
|
|
*
|
|
* Such as:
|
|
*
|
|
* `cav(e)`, `lov(e)`, `hop(e)`, `crim(e)`, but `snow`,
|
|
* `box`, `tray`.
|
|
*/
|
|
static int
|
|
cvc(int index) {
|
|
int character;
|
|
|
|
if (index < k0 + 2 || !isConsonant(index) || isConsonant(index - 1) || !isConsonant(index - 2)) {
|
|
return FALSE;
|
|
}
|
|
|
|
character = b[index];
|
|
|
|
if (character == 'w' || character == 'x' || character == 'y') {
|
|
return FALSE;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* `ends(s)` is `TRUE` when `k0, ...k` ends with `value`. */
|
|
static int
|
|
ends(const char *value) {
|
|
int length = value[0];
|
|
|
|
/* Tiny speed-up. */
|
|
if (value[length] != b[k]) {
|
|
return FALSE;
|
|
}
|
|
|
|
if (length > k - k0 + 1) {
|
|
return FALSE;
|
|
}
|
|
|
|
if (memcmp(b + k - length + 1, value + 1, length) != 0) {
|
|
return FALSE;
|
|
}
|
|
|
|
j = k - length;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* `setTo(value)` sets `(j + 1), ...k` to the characters in
|
|
* `value`, readjusting `k`. */
|
|
static void
|
|
setTo(const char *value) {
|
|
int length = value[0];
|
|
|
|
memmove(b + j + 1, value + 1, length);
|
|
|
|
k = j + length;
|
|
}
|
|
|
|
/* Set string. */
|
|
static void
|
|
replace(const char *value) {
|
|
if (getMeasure() > 0) {
|
|
setTo(value);
|
|
}
|
|
}
|
|
|
|
/* `step1ab()` gets rid of plurals, `-ed`, `-ing`.
|
|
*
|
|
* Such as:
|
|
*
|
|
* caresses -> caress
|
|
* ponies -> poni
|
|
* ties -> ti
|
|
* caress -> caress
|
|
* cats -> cat
|
|
*
|
|
* feed -> feed
|
|
* agreed -> agree
|
|
* disabled -> disable
|
|
*
|
|
* matting -> mat
|
|
* mating -> mate
|
|
* meeting -> meet
|
|
* milling -> mill
|
|
* messing -> mess
|
|
*
|
|
* meetings -> meet
|
|
*/
|
|
static void
|
|
step1ab() {
|
|
int character;
|
|
|
|
if (b[k] == 's') {
|
|
if (ends("\04" "sses")) {
|
|
k -= 2;
|
|
} else if (ends("\03" "ies")) {
|
|
setTo("\01" "i");
|
|
} else if (b[k - 1] != 's') {
|
|
k--;
|
|
}
|
|
}
|
|
|
|
if (ends("\03" "eed")) {
|
|
if (getMeasure() > 0) {
|
|
k--;
|
|
}
|
|
} else if ((ends("\02" "ed") || ends("\03" "ing")) && vowelInStem()) {
|
|
k = j;
|
|
|
|
if (ends("\02" "at")) {
|
|
setTo("\03" "ate");
|
|
} else if (ends("\02" "bl")) {
|
|
setTo("\03" "ble");
|
|
} else if (ends("\02" "iz")) {
|
|
setTo("\03" "ize");
|
|
} else if (isDoubleConsonant(k)) {
|
|
k--;
|
|
|
|
character = b[k];
|
|
|
|
if (character == 'l' || character == 's' || character == 'z') {
|
|
k++;
|
|
}
|
|
} else if (getMeasure() == 1 && cvc(k)) {
|
|
setTo("\01" "e");
|
|
}
|
|
}
|
|
}
|
|
|
|
/* `step1c()` turns terminal `"y"` to `"i"` when there
|
|
* is another vowel in the stem. */
|
|
static void
|
|
step1c() {
|
|
if (ends("\01" "y") && vowelInStem()) {
|
|
b[k] = 'i';
|
|
}
|
|
}
|
|
|
|
/* `step2()` maps double suffices to single ones.
|
|
* so -ization ( = -ize plus -ation) maps to -ize etc.
|
|
* note that the string before the suffix must give
|
|
* getMeasure() > 0. */
|
|
static void
|
|
step2() {
|
|
switch (b[k - 1]) {
|
|
case 'a':
|
|
if (ends("\07" "ational")) {
|
|
replace("\03" "ate");
|
|
break;
|
|
}
|
|
|
|
if (ends("\06" "tional")) {
|
|
replace("\04" "tion");
|
|
break;
|
|
}
|
|
|
|
break;
|
|
case 'c':
|
|
if (ends("\04" "enci")) {
|
|
replace("\04" "ence");
|
|
break;
|
|
}
|
|
|
|
if (ends("\04" "anci")) {
|
|
replace("\04" "ance");
|
|
break;
|
|
}
|
|
|
|
break;
|
|
case 'e':
|
|
if (ends("\04" "izer")) {
|
|
replace("\03" "ize");
|
|
break;
|
|
}
|
|
|
|
break;
|
|
case 'l':
|
|
/* --DEPARTURE--: To match the published algorithm,
|
|
* replace this line with:
|
|
*
|
|
* ```
|
|
* if (ends("\04" "abli")) {
|
|
* replace("\04" "able");
|
|
*
|
|
* break;
|
|
* }
|
|
* ```
|
|
*/
|
|
if (ends("\03" "bli")) {
|
|
replace("\03" "ble");
|
|
break;
|
|
}
|
|
|
|
if (ends("\04" "alli")) {
|
|
replace("\02" "al");
|
|
break;
|
|
}
|
|
|
|
if (ends("\05" "entli")) {
|
|
replace("\03" "ent");
|
|
break;
|
|
}
|
|
|
|
if (ends("\03" "eli")) {
|
|
replace("\01" "e");
|
|
break;
|
|
}
|
|
|
|
if (ends("\05" "ousli")) {
|
|
replace("\03" "ous");
|
|
break;
|
|
}
|
|
|
|
break;
|
|
case 'o':
|
|
if (ends("\07" "ization")) {
|
|
replace("\03" "ize");
|
|
break;
|
|
}
|
|
|
|
if (ends("\05" "ation")) {
|
|
replace("\03" "ate");
|
|
break;
|
|
}
|
|
|
|
if (ends("\04" "ator")) {
|
|
replace("\03" "ate");
|
|
break;
|
|
}
|
|
|
|
break;
|
|
case 's':
|
|
if (ends("\05" "alism")) {
|
|
replace("\02" "al");
|
|
break;
|
|
}
|
|
|
|
if (ends("\07" "iveness")) {
|
|
replace("\03" "ive");
|
|
break;
|
|
}
|
|
|
|
if (ends("\07" "fulness")) {
|
|
replace("\03" "ful");
|
|
break;
|
|
}
|
|
|
|
if (ends("\07" "ousness")) {
|
|
replace("\03" "ous");
|
|
break;
|
|
}
|
|
|
|
break;
|
|
case 't':
|
|
if (ends("\05" "aliti")) {
|
|
replace("\02" "al");
|
|
break;
|
|
}
|
|
|
|
if (ends("\05" "iviti")) {
|
|
replace("\03" "ive");
|
|
break;
|
|
}
|
|
|
|
if (ends("\06" "biliti")) {
|
|
replace("\03" "ble");
|
|
break;
|
|
}
|
|
|
|
break;
|
|
/* --DEPARTURE--: To match the published algorithm, delete this line. */
|
|
case 'g':
|
|
if (ends("\04" "logi")) {
|
|
replace("\03" "log");
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* `step3()` deals with -ic-, -full, -ness etc.
|
|
* similar strategy to step2. */
|
|
static void
|
|
step3() {
|
|
switch (b[k]) {
|
|
case 'e':
|
|
if (ends("\05" "icate")) {
|
|
replace("\02" "ic");
|
|
break;
|
|
}
|
|
|
|
if (ends("\05" "ative")) {
|
|
replace("\00" "");
|
|
break;
|
|
}
|
|
|
|
if (ends("\05" "alize")) {
|
|
replace("\02" "al");
|
|
break;
|
|
}
|
|
|
|
break;
|
|
case 'i':
|
|
if (ends("\05" "iciti")) {
|
|
replace("\02" "ic");
|
|
break;
|
|
}
|
|
|
|
break;
|
|
case 'l':
|
|
if (ends("\04" "ical")) {
|
|
replace("\02" "ic");
|
|
break;
|
|
}
|
|
|
|
if (ends("\03" "ful")) {
|
|
replace("\00" "");
|
|
break;
|
|
}
|
|
|
|
break;
|
|
case 's':
|
|
if (ends("\04" "ness")) {
|
|
replace("\00" "");
|
|
break;
|
|
}
|
|
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* `step4()` takes off -ant, -ence etc., in
|
|
* context <c>vcvc<v>. */
|
|
static void
|
|
step4() {
|
|
switch (b[k - 1]) {
|
|
case 'a':
|
|
if (ends("\02" "al")) {
|
|
break;
|
|
}
|
|
|
|
return;
|
|
case 'c':
|
|
if (ends("\04" "ance")) {
|
|
break;
|
|
}
|
|
|
|
if (ends("\04" "ence")) {
|
|
break;
|
|
}
|
|
|
|
return;
|
|
case 'e':
|
|
if (ends("\02" "er")) {
|
|
break;
|
|
}
|
|
|
|
return;
|
|
case 'i':
|
|
if (ends("\02" "ic")) {
|
|
break;
|
|
}
|
|
|
|
return;
|
|
case 'l':
|
|
if (ends("\04" "able")) {
|
|
break;
|
|
}
|
|
|
|
if (ends("\04" "ible")) {
|
|
break;
|
|
}
|
|
|
|
return;
|
|
case 'n':
|
|
if (ends("\03" "ant")) {
|
|
break;
|
|
}
|
|
|
|
if (ends("\05" "ement")) {
|
|
break;
|
|
}
|
|
|
|
if (ends("\04" "ment")) {
|
|
break;
|
|
}
|
|
|
|
if (ends("\03" "ent")) {
|
|
break;
|
|
}
|
|
|
|
return;
|
|
case 'o':
|
|
if (ends("\03" "ion") && j >= k0 && (b[j] == 's' || b[j] == 't')) {
|
|
break;
|
|
}
|
|
|
|
/* takes care of -ous */
|
|
if (ends("\02" "ou")) {
|
|
break;
|
|
}
|
|
|
|
return;
|
|
case 's':
|
|
if (ends("\03" "ism")) {
|
|
break;
|
|
}
|
|
|
|
return;
|
|
case 't':
|
|
if (ends("\03" "ate")) {
|
|
break;
|
|
}
|
|
|
|
if (ends("\03" "iti")) {
|
|
break;
|
|
}
|
|
|
|
return;
|
|
case 'u':
|
|
if (ends("\03" "ous")) {
|
|
break;
|
|
}
|
|
|
|
return;
|
|
case 'v':
|
|
if (ends("\03" "ive")) {
|
|
break;
|
|
}
|
|
|
|
return;
|
|
case 'z':
|
|
if (ends("\03" "ize")) {
|
|
break;
|
|
}
|
|
|
|
return;
|
|
default:
|
|
return;
|
|
}
|
|
|
|
if (getMeasure() > 1) {
|
|
k = j;
|
|
}
|
|
}
|
|
|
|
/* `step5()` removes a final `-e` if `getMeasure()` is
|
|
* greater than `1`, and changes `-ll` to `-l` if
|
|
* `getMeasure()` is greater than `1`. */
|
|
static void
|
|
step5() {
|
|
int a;
|
|
|
|
j = k;
|
|
|
|
if (b[k] == 'e') {
|
|
a = getMeasure();
|
|
|
|
if (a > 1 || (a == 1 && !cvc(k - 1))) {
|
|
k--;
|
|
}
|
|
}
|
|
|
|
if (b[k] == 'l' && isDoubleConsonant(k) && getMeasure() > 1) {
|
|
k--;
|
|
}
|
|
}
|
|
|
|
/* In `stem(p, i, j)`, `p` is a `char` pointer, and the
|
|
* string to be stemmed is from `p[i]` to
|
|
* `p[j]` (inclusive).
|
|
*
|
|
* Typically, `i` is zero and `j` is the offset to the
|
|
* last character of a string, `(p[j + 1] == '\0')`.
|
|
* The stemmer adjusts the characters `p[i]` ... `p[j]`
|
|
* and returns the new end-point of the string, `k`.
|
|
*
|
|
* Stemming never increases word length, so `i <= k <= j`.
|
|
*
|
|
* To turn the stemmer into a module, declare 'stem' as
|
|
* extern, and delete the remainder of this file. */
|
|
int
|
|
stem(char *p, int index, int position) {
|
|
/* Copy the parameters into statics. */
|
|
b = p;
|
|
k = position;
|
|
k0 = index;
|
|
|
|
if (k <= k0 + 1) {
|
|
return k; /* --DEPARTURE-- */
|
|
}
|
|
|
|
/* With this line, strings of length 1 or 2 don't
|
|
* go through the stemming process, although no
|
|
* mention is made of this in the published
|
|
* algorithm. Remove the line to match the published
|
|
* algorithm. */
|
|
step1ab();
|
|
|
|
if (k > k0) {
|
|
step1c();
|
|
step2();
|
|
step3();
|
|
step4();
|
|
step5();
|
|
}
|
|
|
|
return k;
|
|
}
|
|
|
|
|
|
static int l_stmr(lua_State* L)
|
|
{
|
|
char* word = strdup(luaL_checkstring(L,1));
|
|
int end = stem(word, 0, strlen(word) - 1);
|
|
word[end + 1] = 0;
|
|
lua_pushstring(L, word);
|
|
free(word);
|
|
return 1;
|
|
}
|
|
|
|
static const struct luaL_Reg _lib [] = {
|
|
{"stmr", l_stmr},
|
|
{NULL,NULL}
|
|
};
|
|
|
|
int luaopen_stmr(lua_State *L)
|
|
{
|
|
luaL_newlib(L, _lib);
|
|
return 1;
|
|
} |