aboutsummaryrefslogtreecommitdiffstats
path: root/netinstall/9/ja/content/search/stemmers/fr_stemmer.js
diff options
context:
space:
mode:
Diffstat (limited to 'netinstall/9/ja/content/search/stemmers/fr_stemmer.js')
-rw-r--r--netinstall/9/ja/content/search/stemmers/fr_stemmer.js299
1 files changed, 299 insertions, 0 deletions
diff --git a/netinstall/9/ja/content/search/stemmers/fr_stemmer.js b/netinstall/9/ja/content/search/stemmers/fr_stemmer.js
new file mode 100644
index 00000000..34f97431
--- /dev/null
+++ b/netinstall/9/ja/content/search/stemmers/fr_stemmer.js
@@ -0,0 +1,299 @@
+/*
+ * Author: Kasun Gajasinghe
+ * E-Mail: kasunbg AT gmail DOT com
+ * Date: 09.08.2010
+ *
+ * usage: stemmer(word);
+ * ex: var stem = stemmer(foobar);
+ * Implementation of the stemming algorithm from http://snowball.tartarus.org/algorithms/french/stemmer.html
+ *
+ * LICENSE:
+ *
+ * Copyright (c) 2010, Kasun Gajasinghe. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY KASUN GAJASINGHE ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL KASUN GAJASINGHE BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+var stemmer = function(word){
+// Letters in French include the following accented forms,
+// â à ç ë é ê è ï î ô û ù
+// The following letters are vowels:
+// a e i o u y â à ë é ê è ï î ô û ù
+
+ word = word.toLowerCase();
+ var oriWord = word;
+ word = word.replace(/qu/g, 'qU'); //have to perform first, as after the operation, capital U is not treated as a vowel
+ word = word.replace(/([aeiouyâàëéêèïîôûù])u([aeiouyâàëéêèïîôûù])/g, '$1U$2');
+ word = word.replace(/([aeiouyâàëéêèïîôûù])i([aeiouyâàëéêèïîôûù])/g, '$1I$2');
+ word = word.replace(/([aeiouyâàëéêèïîôûù])y/g, '$1Y');
+ word = word.replace(/y([aeiouyâàëéêèïîôûù])/g, 'Y$1');
+
+ var rv='';
+ var rvIndex = -1;
+ if(word.search(/^(par|col|tap)/) != -1 || word.search(/^[aeiouyâàëéêèïîôûù]{2}/) != -1){
+ rv = word.substring(3);
+ rvIndex = 3;
+ } else {
+ rvIndex = word.substring(1).search(/[aeiouyâàëéêèïîôûù]/);
+ if(rvIndex != -1){
+ rvIndex +=2; //+2 is to supplement the substring(1) used to find rvIndex
+ rv = word.substring(rvIndex);
+ } else {
+ rvIndex = word.length;
+ }
+ }
+
+// R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel.
+// R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel
+ var r1Index = word.search(/[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/);
+ var r1 = '';
+ if (r1Index != -1) {
+ r1Index += 2;
+ r1 = word.substring(r1Index);
+ } else {
+ r1Index = word.length;
+ }
+
+ var r2Index = -1;
+ var r2 = '';
+ if (r1Index != -1) {
+ r2Index = r1.search(/[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/);
+ if (r2Index != -1) {
+ r2Index += 2;
+ r2 = r1.substring(r2Index);
+ r2Index += r1Index;
+ } else {
+ r2 = '';
+ r2Index = word.length;
+ }
+ }
+ if (r1Index != -1 && r1Index < 3) {
+ r1Index = 3;
+ r1 = word.substring(r1Index);
+ }
+
+ /*
+ Step 1: Standard suffix removal
+ */
+ var a1Index = word.search(/(ance|iqUe|isme|able|iste|eux|ances|iqUes|ismes|ables|istes)$/);
+ var a2Index = word.search(/(atrice|ateur|ation|atrices|ateurs|ations)$/);
+ var a3Index = word.search(/(logie|logies)$/);
+ var a4Index = word.search(/(usion|ution|usions|utions)$/);
+ var a5Index = word.search(/(ence|ences)$/);
+ var a6Index = word.search(/(ement|ements)$/);
+ var a7Index = word.search(/(ité|ités)$/);
+ var a8Index = word.search(/(if|ive|ifs|ives)$/);
+ var a9Index = word.search(/(eaux)$/);
+ var a10Index = word.search(/(aux)$/);
+ var a11Index = word.search(/(euse|euses)$/);
+ var a12Index = word.search(/[^aeiouyâàëéêèïîôûù](issement|issements)$/);
+ var a13Index = word.search(/(amment)$/);
+ var a14Index = word.search(/(emment)$/);
+ var a15Index = word.search(/[aeiouyâàëéêèïîôûù](ment|ments)$/);
+
+ if(a1Index != -1 && a1Index >= r2Index){
+ word = word.substring(0,a1Index);
+ } else if(a2Index != -1 && a2Index >= r2Index){
+ word = word.substring(0,a2Index);
+ var a2Index2 = word.search(/(ic)$/);
+ if(a2Index2 != -1 && a2Index2 >= r2Index){
+ word = word.substring(0, a2Index2); //if preceded by ic, delete if in R2,
+ } else { //else replace by iqU
+ word = word.replace(/(ic)$/,'iqU');
+ }
+ } else if(a3Index != -1 && a3Index >= r2Index){
+ word = word.replace(/(logie|logies)$/,'log'); //replace with log if in R2
+ } else if(a4Index != -1 && a4Index >= r2Index){
+ word = word.replace(/(usion|ution|usions|utions)$/,'u'); //replace with u if in R2
+ } else if(a5Index != -1 && a5Index >= r2Index){
+ word = word.replace(/(ence|ences)$/,'ent'); //replace with ent if in R2
+ } else if(a6Index != -1 && a6Index >= rvIndex){
+ word = word.substring(0,a6Index);
+ if(word.search(/(iv)$/) >= r2Index){
+ word = word.replace(/(iv)$/, '');
+ if(word.search(/(at)$/) >= r2Index){
+ word = word.replace(/(at)$/, '');
+ }
+ } else if(word.search(/(eus)$/) != -1){
+ var a6Index2 = word.search(/(eus)$/);
+ if(a6Index2 >=r2Index){
+ word = word.substring(0, a6Index2);
+ } else if(a6Index2 >= r1Index){
+ word = word.substring(0,a6Index2)+"eux";
+ }
+ } else if(word.search(/(abl|iqU)$/) >= r2Index){
+ word = word.replace(/(abl|iqU)$/,''); //if preceded by abl or iqU, delete if in R2,
+ } else if(word.search(/(ièr|Ièr)$/) >= rvIndex){
+ word = word.replace(/(ièr|Ièr)$/,'i'); //if preceded by abl or iqU, delete if in R2,
+ }
+ } else if(a7Index != -1 && a7Index >= r2Index){
+ word = word.substring(0,a7Index); //delete if in R2
+ if(word.search(/(abil)$/) != -1){ //if preceded by abil, delete if in R2, else replace by abl, otherwise,
+ var a7Index2 = word.search(/(abil)$/);
+ if(a7Index2 >=r2Index){
+ word = word.substring(0, a7Index2);
+ } else {
+ word = word.substring(0,a7Index2)+"abl";
+ }
+ } else if(word.search(/(ic)$/) != -1){
+ var a7Index3 = word.search(/(ic)$/);
+ if(a7Index3 != -1 && a7Index3 >= r2Index){
+ word = word.substring(0, a7Index3); //if preceded by ic, delete if in R2,
+ } else { //else replace by iqU
+ word = word.replace(/(ic)$/,'iqU');
+ }
+ } else if(word.search(/(iv)$/) != r2Index){
+ word = word.replace(/(iv)$/,'');
+ }
+ } else if(a8Index != -1 && a8Index >= r2Index){
+ word = word.substring(0,a8Index);
+ if(word.search(/(at)$/) >= r2Index){
+ word = word.replace(/(at)$/, '');
+ if(word.search(/(ic)$/) >= r2Index){
+ word = word.replace(/(ic)$/, '');
+ } else { word = word.replace(/(ic)$/, 'iqU'); }
+ }
+ } else if(a9Index != -1){ word = word.replace(/(eaux)/,'eau')
+ } else if(a10Index >= r1Index){ word = word.replace(/(aux)/,'al')
+ } else if(a11Index != -1 ){
+ var a11Index2 = word.search(/(euse|euses)$/);
+ if(a11Index2 >=r2Index){
+ word = word.substring(0, a11Index2);
+ } else if(a11Index2 >= r1Index){
+ word = word.substring(0, a11Index2)+"eux";
+ }
+ } else if(a12Index!=-1 && a12Index>=r1Index){
+ word = word.substring(0,a12Index+1); //+1- amendment to non-vowel
+ } else if(a13Index!=-1 && a13Index>=rvIndex){
+ word = word.replace(/(amment)$/,'ant');
+ } else if(a14Index!=-1 && a14Index>=rvIndex){
+ word = word.replace(/(emment)$/,'ent');
+ } else if(a15Index!=-1 && a15Index>=rvIndex){
+ word = word.substring(0,a15Index+1);
+ }
+
+ /* Step 2a: Verb suffixes beginning i*/
+ var wordStep1 = word;
+ var step2aDone = false;
+ if(oriWord == word.toLowerCase() || oriWord.search(/(amment|emment|ment|ments)$/) != -1){
+ step2aDone = true;
+ var b1Regex = /([^aeiouyâàëéêèïîôûù])(îmes|ît|îtes|i|ie|ies|ir|ira|irai|iraIent|irais|irait|iras|irent|irez|iriez|irions|irons|iront|is|issaIent|issais|issait|issant|issante|issantes|issants|isse|issent|isses|issez|issiez|issions|issons|it)$/i;
+ if(word.search(b1Regex) >= rvIndex){
+ word = word.replace(b1Regex,'$1');
+ }
+ }
+
+ /* Step 2b: Other verb suffixes*/
+ if (step2aDone && wordStep1 == word) {
+ if (word.search(/(ions)$/) >= r2Index) {
+ word = word.replace(/(ions)$/, '');
+ } else {
+ var b2Regex = /(é|ée|ées|és|èrent|er|era|erai|eraIent|erais|erait|eras|erez|eriez|erions|erons|eront|ez|iez)$/i;
+ if (word.search(b2Regex) >= rvIndex) {
+ word = word.replace(b2Regex, '');
+ } else {
+ var b3Regex = /e(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i;
+ if (word.search(b3Regex) >= rvIndex) {
+ word = word.replace(b3Regex, '');
+ } else {
+ var b3Regex2 = /(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i;
+ if (word.search(b3Regex2) >= rvIndex) {
+ word = word.replace(b3Regex2, '');
+ }
+ }
+ }
+ }
+ }
+
+ if(oriWord != word.toLowerCase()){
+ /* Step 3 */
+ var rep = '';
+ if(word.search(/Y$/) != -1) {
+ word = word.replace(/Y$/, 'i');
+ } else if(word.search(/ç$/) != -1){
+ word = word.replace(/ç$/, 'c');
+ }
+ } else {
+ /* Step 4 */
+ //If the word ends s, not preceded by a, i, o, u, è or s, delete it.
+ if (word.search(/([^aiouès])s$/) >= rvIndex) {
+ word = word.replace(/([^aiouès])s$/, '$1');
+ }
+ var e1Index = word.search(/ion$/);
+ if (e1Index >= r2Index && word.search(/[st]ion$/) >= rvIndex) {
+ word = word.substring(0, e1Index);
+ } else {
+ var e2Index = word.search(/(ier|ière|Ier|Ière)$/);
+ if (e2Index != -1 && e2Index >= rvIndex) {
+ word = word.substring(0, e2Index) + "i";
+ } else {
+ if (word.search(/e$/) >= rvIndex) {
+ word = word.replace(/e$/, ''); //delete last e
+ } else if (word.search(/guë$/) >= rvIndex) {
+ word = word.replace(/guë$/, 'gu');
+ }
+ }
+ }
+ }
+
+ /* Step 5: Undouble */
+ //word = word.replace(/(en|on|et|el|eil)(n|t|l)$/,'$1');
+ word = word.replace(/(en|on)(n)$/,'$1');
+ word = word.replace(/(ett)$/,'et');
+ word = word.replace(/(el|eil)(l)$/,'$1');
+
+ /* Step 6: Un-accent */
+ word = word.replace(/[éè]([^aeiouyâàëéêèïîôûù]+)$/,'e$1');
+ word = word.toLowerCase();
+ return word;
+};
+
+var eqOut = new Array();
+var noteqOut = new Array();
+var eqCount = 0;
+/*
+To test the stemming, create two arrays named "voc" and "COut" which are for vocabualary and the stemmed output.
+Then add the vocabulary strings and output strings. This method will generate the stemmed output for "voc" and will
+compare the output with COut.
+ (I used porter's voc and out files and did a regex to convert them to js objects. regex: /");\nvoc.push("/g . This
+ will add strings to voc array such that output would look like: voc.push("foobar"); ) drop me an email for any help.
+ */
+function testFr(){
+ var start = new Date().getTime(); //execution time
+ eqCount = 0;
+ eqOut = new Array();
+ noteqOut = new Array();
+ for(var k=0;k<voc.length;k++){
+ if(COut[k]==stemmer(voc[k])){
+ eqCount++;
+ eqOut.push("v: "+voc[k]+" c: "+COut[k]);
+ } else {
+ noteqOut.push(voc[k]+", c: "+COut[k]+" s:"+stemmer(voc[k]));
+ }
+ }
+ var end = new Date().getTime(); //execution time
+ var time = end-start;
+ alert("equal count= "+eqCount+" out of "+voc.length+" words. time= "+time+" ms");
+ //console.log("equal count= "+eqCount+" out of "+voc.length+" words. time= "+time+" ms");
+}
+
+
class="hl kwb">$s, @para) = @_; sprintf(translate($s), @para); } sub N_ { $_[0] } sub makedev { ($_[0] << 8) | $_[1] } sub unmakedev { $_[0] >> 8, $_[0] & 0xff } sub translate_real { my ($s, $o_plural, $o_nb) = @_; $s or return ''; my $s2; foreach (@::textdomains, 'libDrakX') { if ($o_plural) { $s2 = Locale::gettext::dngettext($_, $s, $o_plural, $o_nb); } else { $s2 = Locale::gettext::dgettext($_, $s); } # when utf8 pragma is in use, Locale::gettext() returns an utf8 string not tagged as such: c::set_tagged_utf8($s2) if !utf8::is_utf8($s2) && utf8::is_utf8($s); return $s2 if $s ne $s2 && $s2 ne $o_plural; } # didn't lookup anything or locale is "C": $s2; } sub remove_translate_context { my ($s) = @_; #- translation with context, kde-like $s =~ s/^_:.*(?:\n)?//g; $s; } sub translate { my $s = translate_real(@_); $::one_message_has_been_translated ||= join(':', (caller(1))[1,2]); #- see mygtk2.pm remove_translate_context($s); } sub from_utf8 { my ($s) = @_; Locale::gettext::iconv($s, "utf-8", undef); #- undef = locale charmap = nl_langinfo(CODESET) } sub to_utf8 { my ($s) = @_; my $str = Locale::gettext::iconv($s, undef, "utf-8"); #- undef = locale charmap = nl_langinfo(CODESET) c::set_tagged_utf8($str); $str; } #- This is needed because text printed by Gtk2 will always be encoded #- in UTF-8; #- we first check if LC_ALL is defined, because if it is, changing #- only LC_COLLATE will have no effect. sub set_l10n_sort() { my $collation_locale = $ENV{LC_ALL}; if (!$collation_locale) { $collation_locale = c::setlocale(c::LC_COLLATE()); $collation_locale =~ /UTF-8/ or c::setlocale(c::LC_COLLATE(), "$collation_locale.UTF-8"); } } sub setVirtual { my ($vt_number) = @_; my $vt = ''; sysopen(my $C, "/dev/console", 2) or die "failed to open /dev/console: $!"; ioctl($C, c::VT_GETSTATE(), $vt) && ioctl($C, c::VT_ACTIVATE(), $vt_number) && ioctl($C, c::VT_WAITACTIVE(), $vt_number) or die "setVirtual failed"; unpack "S", $vt; } sub nonblock { my ($F) = @_; fcntl($F, c::F_SETFL(), fcntl($F, c::F_GETFL(), 0) | c::O_NONBLOCK()) or die "cannot fcntl F_SETFL: $!"; } #- return a size in sector #- ie MB(1) is 2048 sectors, which is 1MB sub MB { my ($nb_MB) = @_; $nb_MB * 2048; } sub removeXiBSuffix { local $_ = shift; /(\d+)\s*kB?$/i and return $1 * 1024; /(\d+)\s*MB?$/i and return $1 * 1024 * 1024; /(\d+)\s*GB?$/i and return $1 * 1024 * 1024 * 1024; /(\d+)\s*TB?$/i and return $1 * 1024 * 1024 * 1024 * 1024; $_; } sub formatXiB { my ($newnb, $o_newbase) = @_; my $newbase = $o_newbase || 1; my $sign = $newnb < 0 ? -1 : 1; $newnb = abs(int($newnb)); my ($nb, $base); my $decr = sub { ($nb, $base) = ($newnb, $newbase); $base >= 1024 ? ($newbase = $base / 1024) : ($newnb = $nb / 1024); }; my $suffix; foreach (N("B"), N("KB"), N("MB"), N("GB"), N("TB")) { $decr->(); if ($newnb < 1 && $newnb * $newbase < 1) { $suffix = $_; last; } } my $v = $nb * $base; my $s = $v < 10 && int(10 * $v - 10 * int($v)); int($v * $sign) . ($s ? "." . abs($s) : '') . ($suffix || N("TB")); } sub formatTime { my ($s, $m, $h) = gmtime($_[0]); if ($h) { sprintf "%02d:%02d", $h, $m; } elsif ($m > 1) { N("%d minutes", $m); } elsif ($m == 1) { N("1 minute"); } else { N("%d seconds", $s); } } sub expand_symlinks_with_absolute_symlinks_in_prefix { my ($prefix, $link) = @_; my ($first, @l) = split '/', $link; $first eq '' or die "expand_symlinks: $link is relative\n"; my ($f, $l); foreach (@l) { $f .= "/$_"; while ($l = readlink "$prefix$f") { $f = $l =~ m!^/! ? $l : MDK::Common::File::concat_symlink($f, "../$l"); } } "$prefix$f"; } sub expand_symlinks_but_simple { my ($f) = @_; my $link = readlink($f); my $f2 = expand_symlinks($f); if ($link && $link !~ m|/|) { # put back the last simple symlink $f2 =~ s|\Q$link\E$|basename($f)|e; } $f2; } sub sync { &MDK::Common::System::sync } BEGIN { undef *formatError } sub formatError { my ($err) = @_; ref($err) eq 'SCALAR' and $err = $$err; log::l("error: $err"); &MDK::Common::String::formatError($err); } sub group_by(&@) { my $f = shift; @_ or return; my $e = shift; my @l = my $last_l = [$e]; foreach (@_) { if ($f->($e, $_)) { push @$last_l, $_; } else { push @l, $last_l = [$_]; $e = $_; } } @l; } # Group the list by n. Returns a reference of lists of length n sub group_n_lm { my $n = shift; my @l; push @l, [ splice(@_, 0, $n) ] while @_; @l; } sub join_lines { my @l; my $s; foreach (@_) { if (/^\s/) { $s .= $_; } else { push @l, $s if $s; $s = $_; } } @l, if_($s, $s); } sub read_alternative { my ($name) = @_; my $alt = readlink("$::prefix/etc/alternatives/$name"); $alt && $::prefix . $alt; } sub set_alternative { my ($command, $executable) = @_; #- check the existance of $executable as an alternative for $command