aboutsummaryrefslogtreecommitdiffstats
path: root/installer/3/uk/content/search/stemmers/en_stemmer.js
diff options
context:
space:
mode:
Diffstat (limited to 'installer/3/uk/content/search/stemmers/en_stemmer.js')
-rw-r--r--installer/3/uk/content/search/stemmers/en_stemmer.js187
1 files changed, 187 insertions, 0 deletions
diff --git a/installer/3/uk/content/search/stemmers/en_stemmer.js b/installer/3/uk/content/search/stemmers/en_stemmer.js
new file mode 100644
index 00000000..f58012f2
--- /dev/null
+++ b/installer/3/uk/content/search/stemmers/en_stemmer.js
@@ -0,0 +1,187 @@
+// Porter stemmer in Javascript. Few comments, but it's easy to follow against the rules in the original
+// paper, in
+//
+// Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+// no. 3, pp 130-137,
+//
+// see also http://www.tartarus.org/~martin/PorterStemmer
+
+// Release 1 be 'andargor', Jul 2004
+// Release 2 (substantially revised) by Christopher McKenzie, Aug 2009
+
+
+var stemmer = (function(){
+ var step2list = {
+ "ational" : "ate",
+ "tional" : "tion",
+ "enci" : "ence",
+ "anci" : "ance",
+ "izer" : "ize",
+ "bli" : "ble",
+ "alli" : "al",
+ "entli" : "ent",
+ "eli" : "e",
+ "ousli" : "ous",
+ "ization" : "ize",
+ "ation" : "ate",
+ "ator" : "ate",
+ "alism" : "al",
+ "iveness" : "ive",
+ "fulness" : "ful",
+ "ousness" : "ous",
+ "aliti" : "al",
+ "iviti" : "ive",
+ "biliti" : "ble",
+ "logi" : "log"
+ },
+
+ step3list = {
+ "icate" : "ic",
+ "ative" : "",
+ "alize" : "al",
+ "iciti" : "ic",
+ "ical" : "ic",
+ "ful" : "",
+ "ness" : ""
+ },
+
+ c = "[^aeiou]", // consonant
+ v = "[aeiouy]", // vowel
+ C = c + "[^aeiouy]*", // consonant sequence
+ V = v + "[aeiou]*", // vowel sequence
+
+ mgr0 = "^(" + C + ")?" + V + C, // [C]VC... is m>0
+ meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$", // [C]VC[V] is m=1
+ mgr1 = "^(" + C + ")?" + V + C + V + C, // [C]VCVC... is m>1
+ s_v = "^(" + C + ")?" + v; // vowel in stem
+
+ return function (w) {
+ var stem,
+ suffix,
+ firstch,
+ re,
+ re2,
+ re3,
+ re4,
+ origword = w;
+
+ if (w.length < 3) { return w; }
+
+ firstch = w.substr(0,1);
+ if (firstch == "y") {
+ w = firstch.toUpperCase() + w.substr(1);
+ }
+
+ // Step 1a
+ re = /^(.+?)(ss|i)es$/;
+ re2 = /^(.+?)([^s])s$/;
+
+ if (re.test(w)) { w = w.replace(re,"$1$2"); }
+ else if (re2.test(w)) { w = w.replace(re2,"$1$2"); }
+
+ // Step 1b
+ re = /^(.+?)eed$/;
+ re2 = /^(.+?)(ed|ing)$/;
+ if (re.test(w)) {
+ var fp = re.exec(w);
+ re = new RegExp(mgr0);
+ if (re.test(fp[1])) {
+ re = /.$/;
+ w = w.replace(re,"");
+ }
+ } else if (re2.test(w)) {
+ var fp = re2.exec(w);
+ stem = fp[1];
+ re2 = new RegExp(s_v);
+ if (re2.test(stem)) {
+ w = stem;
+ re2 = /(at|bl|iz)$/;
+ re3 = new RegExp("([^aeiouylsz])\\1$");
+ re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
+ if (re2.test(w)) { w = w + "e"; }
+ else if (re3.test(w)) { re = /.$/; w = w.replace(re,""); }
+ else if (re4.test(w)) { w = w + "e"; }
+ }
+ }
+
+ // Step 1c
+ re = /^(.+?)y$/;
+ if (re.test(w)) {
+ var fp = re.exec(w);
+ stem = fp[1];
+ re = new RegExp(s_v);
+ if (re.test(stem)) { w = stem + "i"; }
+ }
+
+ // Step 2
+ re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
+ if (re.test(w)) {
+ var fp = re.exec(w);
+ stem = fp[1];
+ suffix = fp[2];
+ re = new RegExp(mgr0);
+ if (re.test(stem)) {
+ w = stem + step2list[suffix];
+ }
+ }
+
+ // Step 3
+ re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
+ if (re.test(w)) {
+ var fp = re.exec(w);
+ stem = fp[1];
+ suffix = fp[2];
+ re = new RegExp(mgr0);
+ if (re.test(stem)) {
+ w = stem + step3list[suffix];
+ }
+ }
+
+ // Step 4
+ re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
+ re2 = /^(.+?)(s|t)(ion)$/;
+ if (re.test(w)) {
+ var fp = re.exec(w);
+ stem = fp[1];
+ re = new RegExp(mgr1);
+ if (re.test(stem)) {
+ w = stem;
+ }
+ } else if (re2.test(w)) {
+ var fp = re2.exec(w);
+ stem = fp[1] + fp[2];
+ re2 = new RegExp(mgr1);
+ if (re2.test(stem)) {
+ w = stem;
+ }
+ }
+
+ // Step 5
+ re = /^(.+?)e$/;
+ if (re.test(w)) {
+ var fp = re.exec(w);
+ stem = fp[1];
+ re = new RegExp(mgr1);
+ re2 = new RegExp(meq1);
+ re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
+ if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) {
+ w = stem;
+ }
+ }
+
+ re = /ll$/;
+ re2 = new RegExp(mgr1);
+ if (re.test(w) && re2.test(w)) {
+ re = /.$/;
+ w = w.replace(re,"");
+ }
+
+ // and turn initial Y back to y
+
+ if (firstch == "y") {
+ w = firstch.toLowerCase() + w.substr(1);
+ }
+
+ return w;
+ }
+})(); \ No newline at end of file