Index: C:/Workspace/lucene-trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemmer.java =================================================================== --- C:/Workspace/lucene-trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemmer.java (revision 518792) +++ C:/Workspace/lucene-trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemmer.java (working copy) @@ -229,8 +229,8 @@ /** * 1) Turn to lowercase * 2) Remove accents - * 3) ã -> a ; õ -> o - * 4) ç -> c + * 3) \u00E3 -> a ; \u00F5 -> o + * 4) \u00E7 -> c * * @return null or a string transformed */ @@ -245,31 +245,31 @@ value = value.toLowerCase() ; for (j=0 ; j < value.length() ; j++) { - if ((value.charAt(j) == 'á') || - (value.charAt(j) == 'â') || - (value.charAt(j) == 'ã')) { + if ((value.charAt(j) == '\u00E1') || + (value.charAt(j) == '\u00E2') || + (value.charAt(j) == '\u00E3')) { r= r + "a" ; continue ; } - if ((value.charAt(j) == 'é') || - (value.charAt(j) == 'ê')) { + if ((value.charAt(j) == '\u00E9') || + (value.charAt(j) == '\u00EA')) { r= r + "e" ; continue ; } - if (value.charAt(j) == 'í') { + if (value.charAt(j) == '\u00ED') { r= r + "i" ; continue ; } - if ((value.charAt(j) == 'ó') || - (value.charAt(j) == 'ô') || - (value.charAt(j) == 'õ')) { + if ((value.charAt(j) == '\u00F3') || + (value.charAt(j) == '\u00F4') || + (value.charAt(j) == '\u00F5')) { r= r + "o" ; continue ; } - if ((value.charAt(j) == 'ú') || - (value.charAt(j) == 'ü')) { + if ((value.charAt(j) == '\u00FA') || + (value.charAt(j) == '\u00FC')) { r= r + "u" ; continue ; } - if (value.charAt(j) == 'ç') { + if (value.charAt(j) == '\u00E7') { r= r + "c" ; continue ; } - if (value.charAt(j) == 'ñ') { + if (value.charAt(j) == '\u00F1') { r= r + "n" ; continue ; } @@ -356,7 +356,7 @@ } /** - * Creates CT (changed term) , substituting * 'ã' and 'õ' for 'a~' and 'o~'. + * Creates CT (changed term) , substituting * '\u00E3' and '\u00F5' for 'a~' and 'o~'. */ private void createCT( String term ) { CT = changeTerm(term) ; @@ -954,7 +954,7 @@ /** * Residual suffix * - * If the word ends with one of the suffixes (os a i o á í ó) + * If the word ends with one of the suffixes (os a i o \u00E1 \u00ED \u00F3) * in RV, delete it * */ @@ -977,11 +977,11 @@ } /** - * If the word ends with one of ( e é ê) in RV,delete it, + * If the word ends with one of ( e \u00E9 \u00EA) in RV,delete it, * and if preceded by 'gu' (or 'ci') with the 'u' (or 'i') in RV, * delete the 'u' (or 'i') * - * Or if the word ends ç remove the cedilha + * Or if the word ends \u00E7 remove the cedilha * */ private void step5() { Index: C:/Workspace/lucene-trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java =================================================================== --- C:/Workspace/lucene-trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java (revision 518792) +++ C:/Workspace/lucene-trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java (working copy) @@ -170,9 +170,9 @@ deleteFromIfTestVowelBeforeIn( R1, new String[] { "issements", "issement" }, false, R0 ); deleteFrom( RV, new String[] { "ements", "ement" } ); - deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "abil", false, R0, "abl" ); - deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "ic", false, R0, "iqU" ); - deleteButSuffixFrom( R2, new String[] { "ités", "ité" }, "iv", true ); + deleteButSuffixFromElseReplace( R2, new String[] { "it\u00E9s", "it\u00E9" }, "abil", false, R0, "abl" ); + deleteButSuffixFromElseReplace( R2, new String[] { "it\u00E9s", "it\u00E9" }, "ic", false, R0, "iqU" ); + deleteButSuffixFrom( R2, new String[] { "it\u00E9s", "it\u00E9" }, "iv", true ); String[] autre = { "ifs", "ives", "if", "ive" }; deleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" ); @@ -209,11 +209,11 @@ * @return boolean - true if something changed in the StringBuffer */ private boolean step2a() { - String[] search = { "îmes", "îtes", "iraIent", "irait", "irais", "irai", "iras", "ira", + String[] search = { "\u00EEmes", "\u00EEtes", "iraIent", "irait", "irais", "irai", "iras", "ira", "irent", "iriez", "irez", "irions", "irons", "iront", "issaIent", "issais", "issantes", "issante", "issants", "issant", "issait", "issais", "issions", "issons", "issiez", "issez", "issent", - "isses", "isse", "ir", "is", "ît", "it", "ies", "ie", "i" }; + "isses", "isse", "ir", "is", "\u00EEt", "it", "ies", "ie", "i" }; return deleteFromIfTestVowelBeforeIn( RV, search, false, RV ); } @@ -224,13 +224,13 @@ */ private void step2b() { String[] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez", - "erons", "eront","erez", "èrent", "era", "ées", "iez", - "ée", "és", "er", "ez", "é" }; + "erons", "eront","erez", "\u00E8rent", "era", "\u00E9es", "iez", + "\u00E9e", "\u00E9s", "er", "ez", "\u00E9" }; deleteFrom( RV, suffix ); String[] search = { "assions", "assiez", "assent", "asses", "asse", "aIent", - "antes", "aIent", "Aient", "ante", "âmes", "âtes", "ants", "ant", - "ait", "aît", "ais", "Ait", "Aît", "Ais", "ât", "as", "ai", "Ai", "a" }; + "antes", "aIent", "Aient", "ante", "\u00E2mes", "\u00E2tes", "ants", "ant", + "ait", "a\u00EEt", "ais", "Ait", "A\u00EEt", "Ais", "\u00E2t", "as", "ai", "Ai", "a" }; deleteButSuffixFrom( RV, search, "e", true ); deleteFrom( R2, new String[] { "ions" } ); @@ -249,7 +249,7 @@ sb.setCharAt( sb.length()-1, 'i' ); setStrings(); } - else if (ch == 'ç') + else if (ch == '\u00E7') { sb.setCharAt( sb.length()-1, 'c' ); setStrings(); @@ -268,7 +268,7 @@ if (ch == 's') { char b = sb.charAt( sb.length()-2 ); - if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != 'è' && b != 's') + if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != '\u00E8' && b != 's') { sb.delete( sb.length() - 1, sb.length()); setStrings(); @@ -279,9 +279,9 @@ if (!found) found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "t" ); - replaceFrom( RV, new String[] { "Ière", "ière", "Ier", "ier" }, "i" ); + replaceFrom( RV, new String[] { "I\u00E8re", "i\u00E8re", "Ier", "ier" }, "i" ); deleteFrom( RV, new String[] { "e" } ); - deleteFromIfPrecededIn( RV, new String[] { "ë" }, R0, "gu" ); + deleteFromIfPrecededIn( RV, new String[] { "\u00EB" }, R0, "gu" ); } /** @@ -316,7 +316,7 @@ { if (!seenVowel) { - if (ch == 'é' || ch == 'è') + if (ch == '\u00E9' || ch == '\u00E8') { pos = i; break; @@ -530,18 +530,18 @@ case 'o': case 'u': case 'y': - case 'â': - case 'à': - case 'ë': - case 'é': - case 'ê': - case 'è': - case 'ï': - case 'î': - case 'ô': - case 'ü': - case 'ù': - case 'û': + case '\u00E2': + case '\u00E0': + case '\u00EB': + case '\u00E9': + case '\u00EA': + case '\u00E8': + case '\u00EF': + case '\u00EE': + case '\u00F4': + case '\u00FC': + case '\u00F9': + case '\u00FB': return true; default: return false; Index: C:/Workspace/lucene-trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemmer.java =================================================================== --- C:/Workspace/lucene-trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemmer.java (revision 518792) +++ C:/Workspace/lucene-trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemmer.java (working copy) @@ -155,12 +155,12 @@ /** * Do some substitutions for the term to reduce overstemming: * - * - Substitute Umlauts with their corresponding vowel: äöü -> aou, - * "ß" is substituted by "ss" + * - Substitute Umlauts with their corresponding vowel: \u00E4\u00F6\u00FC -> aou, + * "\u00DF" is substituted by "ss" * - Substitute a second char of a pair of equal characters with * an asterisk: ?? -> ?* * - Substitute some common character combinations with a token: - * sch/ch/ei/ie/ig/st -> $/§/%/&/#/! + * sch/ch/ei/ie/ig/st -> $/\u00A7/%/&/#/! */ private void substitute( StringBuffer buffer ) { @@ -171,17 +171,17 @@ buffer.setCharAt( c, '*' ); } // Substitute Umlauts. - else if ( buffer.charAt( c ) == 'ä' ) { + else if ( buffer.charAt( c ) == '\u00E4' ) { buffer.setCharAt( c, 'a' ); } - else if ( buffer.charAt( c ) == 'ö' ) { + else if ( buffer.charAt( c ) == '\u00F6' ) { buffer.setCharAt( c, 'o' ); } - else if ( buffer.charAt( c ) == 'ü' ) { + else if ( buffer.charAt( c ) == '\u00FC' ) { buffer.setCharAt( c, 'u' ); } - // Fix bug so that 'ß' at the end of a word is replaced. - else if ( buffer.charAt( c ) == 'ß' ) { + // Fix bug so that '\u00DF' at the end of a word is replaced. + else if ( buffer.charAt( c ) == '\u00DF' ) { buffer.setCharAt( c, 's' ); buffer.insert( c + 1, 's' ); substCount++; @@ -197,7 +197,7 @@ substCount =+ 2; } else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) { - buffer.setCharAt( c, '§' ); + buffer.setCharAt( c, '\u00A7' ); buffer.deleteCharAt( c + 1 ); substCount++; } @@ -228,7 +228,7 @@ /** * Undoes the changes made by substitute(). That are character pairs and * character combinations. Umlauts will remain as their corresponding vowel, - * as "ß" remains as "ss". + * as "\u00DF" remains as "ss". */ private void resubstitute( StringBuffer buffer ) { @@ -241,7 +241,7 @@ buffer.setCharAt( c, 's' ); buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 ); } - else if ( buffer.charAt( c ) == '§' ) { + else if ( buffer.charAt( c ) == '\u00A7' ) { buffer.setCharAt( c, 'c' ); buffer.insert( c + 1, 'h' ); } Index: C:/Workspace/lucene-trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java =================================================================== --- C:/Workspace/lucene-trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java (revision 518792) +++ C:/Workspace/lucene-trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java (working copy) @@ -260,37 +260,37 @@ } /** - * Substitute ä, ë, ï, ö, ü, á , é, í, ó, ú + * Substitute \u00E4, \u00EB, \u00EF, \u00F6, \u00FC, \u00E1 , \u00E9, \u00ED, \u00F3, \u00FA */ private void substitute(StringBuffer buffer) { for (int i = 0; i < buffer.length(); i++) { switch (buffer.charAt(i)) { - case 'ä': - case 'á': + case '\u00E4': + case '\u00E1': { buffer.setCharAt(i, 'a'); break; } - case 'ë': - case 'é': + case '\u00EB': + case '\u00E9': { buffer.setCharAt(i, 'e'); break; } - case 'ü': - case 'ú': + case '\u00FC': + case '\u00FA': { buffer.setCharAt(i, 'u'); break; } - case 'ï': + case '\u00EF': case 'i': { buffer.setCharAt(i, 'i'); break; } - case 'ö': - case 'ó': + case '\u00F6': + case '\u00F3': { buffer.setCharAt(i, 'o'); break; @@ -392,7 +392,7 @@ case 'i': case 'u': case 'y': - case 'è': + case '\u00E8': { return true; }