/* This is the Porter stemming algorithm, coded up in ANSI C by the * author. It may be be regarded as canonical, in that it follows the * algorithm presented in * * Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, * no. 3, pp 130-137, * * only differing from it at the points marked --DEPARTURE-- below. * * See also http://www.tartarus.org/~martin/PorterStemmer * * The algorithm as described in the paper could be exactly replicated * by adjusting the points of DEPARTURE, but this is barely necessary, * because (a) the points of DEPARTURE are definitely improvements, and * (b) no encoding of the Porter stemmer I have seen is anything like * as exact as this version, even with the points of DEPARTURE! * * You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which * 'stem' takes a list of inputs and sends the stemmed equivalent to * stdout. * * The algorithm as encoded here is particularly fast. * * Release 1: was many years ago * Release 2: 11 Apr 2013 * fixes a bug noted by Matt Patenaude , * * case 'o': if (ends("\03" "ion") && (b[j] == 's' || b[j] == 't')) break; * ==> * case 'o': if (ends("\03" "ion") && j >= k0 && (b[j] == 's' || b[j] == 't')) break; * * to avoid accessing b[k0-1] when the word in b is "ion". * Release 3: 25 Mar 2014 * fixes a similar bug noted by Klemens Baum , * that if step1ab leaves a one letter result (ied -> i, aing -> a etc), * step2 and step4 access the byte before the first letter. So we skip * steps after step1ab unless k > k0. */ #include #include "lua/lualib.h" #define TRUE 1 #define FALSE 0 /* The main part of the stemming algorithm starts here. b is a buffer * holding a word to be stemmed. The letters are in b[k0], b[k0+1] ... * ending at b[k]. In fact k0 = 0 in this demo program. k is readjusted * downwards as the stemming progresses. Zero termination is not in fact * used in the algorithm. * * Note that only lower case sequences are stemmed. Forcing to lower case * should be done before stem(...) is called. */ /* buffer for word to be stemmed */ static char *b; static int k; static int k0; /* j is a general offset into the string */ static int j; /** * TRUE when `b[i]` is a consonant. */ static int isConsonant(int index) { switch (b[index]) { case 'a': case 'e': case 'i': case 'o': case 'u': return FALSE; case 'y': return (index == k0) ? TRUE : !isConsonant(index - 1); default: return TRUE; } } /* Measure the number of consonant sequences between * `k0` and `j`. If C is a consonant sequence and V * a vowel sequence, and <..> indicates arbitrary * presence: * * gives 0 * VC gives 1 * VCVC gives 2 * VCVCVC gives 3 * .... */ static int getMeasure() { int position; int index; position = 0; index = k0; while (TRUE) { if (index > j) { return position; } if (!isConsonant(index)) { break; } index++; } index++; while (TRUE) { while (TRUE) { if (index > j) { return position; } if (isConsonant(index)) { break; } index++; } index++; position++; while (TRUE) { if (index > j) { return position; } if (!isConsonant(index)) { break; } index++; } index++; } } /* `TRUE` when `k0, ... j` contains a vowel. */ static int vowelInStem() { int index; index = k0 - 1; while (++index <= j) { if (!isConsonant(index)) { return TRUE; } } return FALSE; } /* `TRUE` when `j` and `(j-1)` are the same consonant. */ static int isDoubleConsonant(int index) { if (b[index] != b[index - 1]) { return FALSE; } return isConsonant(index); } /* `TRUE` when `i - 2, i - 1, i` has the form * `consonant - vowel - consonant` and also if the second * C is not `"w"`, `"x"`, or `"y"`. this is used when * trying to restore an `e` at the end of a short word. * * Such as: * * `cav(e)`, `lov(e)`, `hop(e)`, `crim(e)`, but `snow`, * `box`, `tray`. */ static int cvc(int index) { int character; if (index < k0 + 2 || !isConsonant(index) || isConsonant(index - 1) || !isConsonant(index - 2)) { return FALSE; } character = b[index]; if (character == 'w' || character == 'x' || character == 'y') { return FALSE; } return TRUE; } /* `ends(s)` is `TRUE` when `k0, ...k` ends with `value`. */ static int ends(const char *value) { int length = value[0]; /* Tiny speed-up. */ if (value[length] != b[k]) { return FALSE; } if (length > k - k0 + 1) { return FALSE; } if (memcmp(b + k - length + 1, value + 1, length) != 0) { return FALSE; } j = k - length; return TRUE; } /* `setTo(value)` sets `(j + 1), ...k` to the characters in * `value`, readjusting `k`. */ static void setTo(const char *value) { int length = value[0]; memmove(b + j + 1, value + 1, length); k = j + length; } /* Set string. */ static void replace(const char *value) { if (getMeasure() > 0) { setTo(value); } } /* `step1ab()` gets rid of plurals, `-ed`, `-ing`. * * Such as: * * caresses -> caress * ponies -> poni * ties -> ti * caress -> caress * cats -> cat * * feed -> feed * agreed -> agree * disabled -> disable * * matting -> mat * mating -> mate * meeting -> meet * milling -> mill * messing -> mess * * meetings -> meet */ static void step1ab() { int character; if (b[k] == 's') { if (ends("\04" "sses")) { k -= 2; } else if (ends("\03" "ies")) { setTo("\01" "i"); } else if (b[k - 1] != 's') { k--; } } if (ends("\03" "eed")) { if (getMeasure() > 0) { k--; } } else if ((ends("\02" "ed") || ends("\03" "ing")) && vowelInStem()) { k = j; if (ends("\02" "at")) { setTo("\03" "ate"); } else if (ends("\02" "bl")) { setTo("\03" "ble"); } else if (ends("\02" "iz")) { setTo("\03" "ize"); } else if (isDoubleConsonant(k)) { k--; character = b[k]; if (character == 'l' || character == 's' || character == 'z') { k++; } } else if (getMeasure() == 1 && cvc(k)) { setTo("\01" "e"); } } } /* `step1c()` turns terminal `"y"` to `"i"` when there * is another vowel in the stem. */ static void step1c() { if (ends("\01" "y") && vowelInStem()) { b[k] = 'i'; } } /* `step2()` maps double suffices to single ones. * so -ization ( = -ize plus -ation) maps to -ize etc. * note that the string before the suffix must give * getMeasure() > 0. */ static void step2() { switch (b[k - 1]) { case 'a': if (ends("\07" "ational")) { replace("\03" "ate"); break; } if (ends("\06" "tional")) { replace("\04" "tion"); break; } break; case 'c': if (ends("\04" "enci")) { replace("\04" "ence"); break; } if (ends("\04" "anci")) { replace("\04" "ance"); break; } break; case 'e': if (ends("\04" "izer")) { replace("\03" "ize"); break; } break; case 'l': /* --DEPARTURE--: To match the published algorithm, * replace this line with: * * ``` * if (ends("\04" "abli")) { * replace("\04" "able"); * * break; * } * ``` */ if (ends("\03" "bli")) { replace("\03" "ble"); break; } if (ends("\04" "alli")) { replace("\02" "al"); break; } if (ends("\05" "entli")) { replace("\03" "ent"); break; } if (ends("\03" "eli")) { replace("\01" "e"); break; } if (ends("\05" "ousli")) { replace("\03" "ous"); break; } break; case 'o': if (ends("\07" "ization")) { replace("\03" "ize"); break; } if (ends("\05" "ation")) { replace("\03" "ate"); break; } if (ends("\04" "ator")) { replace("\03" "ate"); break; } break; case 's': if (ends("\05" "alism")) { replace("\02" "al"); break; } if (ends("\07" "iveness")) { replace("\03" "ive"); break; } if (ends("\07" "fulness")) { replace("\03" "ful"); break; } if (ends("\07" "ousness")) { replace("\03" "ous"); break; } break; case 't': if (ends("\05" "aliti")) { replace("\02" "al"); break; } if (ends("\05" "iviti")) { replace("\03" "ive"); break; } if (ends("\06" "biliti")) { replace("\03" "ble"); break; } break; /* --DEPARTURE--: To match the published algorithm, delete this line. */ case 'g': if (ends("\04" "logi")) { replace("\03" "log"); break; } } } /* `step3()` deals with -ic-, -full, -ness etc. * similar strategy to step2. */ static void step3() { switch (b[k]) { case 'e': if (ends("\05" "icate")) { replace("\02" "ic"); break; } if (ends("\05" "ative")) { replace("\00" ""); break; } if (ends("\05" "alize")) { replace("\02" "al"); break; } break; case 'i': if (ends("\05" "iciti")) { replace("\02" "ic"); break; } break; case 'l': if (ends("\04" "ical")) { replace("\02" "ic"); break; } if (ends("\03" "ful")) { replace("\00" ""); break; } break; case 's': if (ends("\04" "ness")) { replace("\00" ""); break; } break; } } /* `step4()` takes off -ant, -ence etc., in * context vcvc. */ static void step4() { switch (b[k - 1]) { case 'a': if (ends("\02" "al")) { break; } return; case 'c': if (ends("\04" "ance")) { break; } if (ends("\04" "ence")) { break; } return; case 'e': if (ends("\02" "er")) { break; } return; case 'i': if (ends("\02" "ic")) { break; } return; case 'l': if (ends("\04" "able")) { break; } if (ends("\04" "ible")) { break; } return; case 'n': if (ends("\03" "ant")) { break; } if (ends("\05" "ement")) { break; } if (ends("\04" "ment")) { break; } if (ends("\03" "ent")) { break; } return; case 'o': if (ends("\03" "ion") && j >= k0 && (b[j] == 's' || b[j] == 't')) { break; } /* takes care of -ous */ if (ends("\02" "ou")) { break; } return; case 's': if (ends("\03" "ism")) { break; } return; case 't': if (ends("\03" "ate")) { break; } if (ends("\03" "iti")) { break; } return; case 'u': if (ends("\03" "ous")) { break; } return; case 'v': if (ends("\03" "ive")) { break; } return; case 'z': if (ends("\03" "ize")) { break; } return; default: return; } if (getMeasure() > 1) { k = j; } } /* `step5()` removes a final `-e` if `getMeasure()` is * greater than `1`, and changes `-ll` to `-l` if * `getMeasure()` is greater than `1`. */ static void step5() { int a; j = k; if (b[k] == 'e') { a = getMeasure(); if (a > 1 || (a == 1 && !cvc(k - 1))) { k--; } } if (b[k] == 'l' && isDoubleConsonant(k) && getMeasure() > 1) { k--; } } /* In `stem(p, i, j)`, `p` is a `char` pointer, and the * string to be stemmed is from `p[i]` to * `p[j]` (inclusive). * * Typically, `i` is zero and `j` is the offset to the * last character of a string, `(p[j + 1] == '\0')`. * The stemmer adjusts the characters `p[i]` ... `p[j]` * and returns the new end-point of the string, `k`. * * Stemming never increases word length, so `i <= k <= j`. * * To turn the stemmer into a module, declare 'stem' as * extern, and delete the remainder of this file. */ int stem(char *p, int index, int position) { /* Copy the parameters into statics. */ b = p; k = position; k0 = index; if (k <= k0 + 1) { return k; /* --DEPARTURE-- */ } /* With this line, strings of length 1 or 2 don't * go through the stemming process, although no * mention is made of this in the published * algorithm. Remove the line to match the published * algorithm. */ step1ab(); if (k > k0) { step1c(); step2(); step3(); step4(); step5(); } return k; } static int l_stmr(lua_State* L) { char* word = strdup(luaL_checkstring(L,1)); int end = stem(word, 0, strlen(word) - 1); word[end + 1] = 0; lua_pushstring(L, word); free(word); return 1; } static const struct luaL_Reg _lib [] = { {"stmr", l_stmr}, {NULL,NULL} }; int luaopen_stmr(lua_State *L) { luaL_newlib(L, _lib); return 1; }