Index: contrib/analyzers/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java =================================================================== --- contrib/analyzers/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java (revision 0) +++ contrib/analyzers/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java (revision 0) @@ -0,0 +1,134 @@ +package org.apache.lucene.analysis.br; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.StringReader; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; + +/** + * Test the Brazilian Stem Filter, which only modifies the term text. + * + * It is very similar to the snowball portuguese algorithm but not exactly the same. 
+ * + */ +public class TestBrazilianStemmer extends TestCase { + + public void testWithSnowballExamples() throws IOException { + check("boa", "boa"); + check("boainain", "boainain"); + check("boas", "boas"); + check("bôas", "boas"); // removes diacritic: different from snowball portuguese + check("boassu", "boassu"); + check("boataria", "boat"); + check("boate", "boat"); + check("boates", "boat"); + check("boatos", "boat"); + check("bob", "bob"); + check("boba", "bob"); + check("bobagem", "bobag"); + check("bobagens", "bobagens"); + check("bobalhões", "bobalho"); // removes diacritic: different from snowball portuguese + check("bobear", "bob"); + check("bobeira", "bobeir"); + check("bobinho", "bobinh"); + check("bobinhos", "bobinh"); + check("bobo", "bob"); + check("bobs", "bobs"); + check("boca", "boc"); + check("bocadas", "boc"); + check("bocadinho", "bocadinh"); + check("bocado", "boc"); + check("bocaiúva", "bocaiuv"); // removes diacritic: different from snowball portuguese + check("boçal", "bocal"); // removes diacritic: different from snowball portuguese + check("bocarra", "bocarr"); + check("bocas", "boc"); + check("bode", "bod"); + check("bodoque", "bodoqu"); + check("body", "body"); + check("boeing", "boeing"); + check("boem", "boem"); + check("boemia", "boem"); + check("boêmio", "boemi"); // removes diacritic: different from snowball portuguese + check("bogotá", "bogot"); + check("boi", "boi"); + check("bóia", "boi"); // removes diacritic: different from snowball portuguese + check("boiando", "boi"); + check("quiabo", "quiab"); + check("quicaram", "quic"); + check("quickly", "quickly"); + check("quieto", "quiet"); + check("quietos", "quiet"); + check("quilate", "quilat"); + check("quilates", "quilat"); + check("quilinhos", "quilinh"); + check("quilo", "quil"); + check("quilombo", "quilomb"); + check("quilométricas", "quilometr"); // removes diacritic: different from snowball portuguese + check("quilométricos", "quilometr"); // removes diacritic: different 
from snowball portuguese + check("quilômetro", "quilometr"); // removes diacritic: different from snowball portuguese + check("quilômetros", "quilometr"); // removes diacritic: different from snowball portuguese + check("quilos", "quil"); + check("quimica", "quimic"); + check("quilos", "quil"); + check("quimica", "quimic"); + check("quimicas", "quimic"); + check("quimico", "quimic"); + check("quimicos", "quimic"); + check("quimioterapia", "quimioterap"); + check("quimioterápicos", "quimioterap"); // removes diacritic: different from snowball portuguese + check("quimono", "quimon"); + check("quincas", "quinc"); + check("quinhão", "quinha"); // removes diacritic: different from snowball portuguese + check("quinhentos", "quinhent"); + check("quinn", "quinn"); + check("quino", "quin"); + check("quinta", "quint"); + check("quintal", "quintal"); + check("quintana", "quintan"); + check("quintanilha", "quintanilh"); + check("quintão", "quinta"); // removes diacritic: different from snowball portuguese + check("quintessência", "quintessente"); // versus snowball portuguese 'quintessent' + check("quintino", "quintin"); + check("quinto", "quint"); + check("quintos", "quint"); + check("quintuplicou", "quintuplic"); + check("quinze", "quinz"); + check("quinzena", "quinzen"); + check("quiosque", "quiosqu"); + } + + + private void check(final String input, final String expected) throws IOException { // analyzes input with BrazilianAnalyzer and asserts it yields exactly one token whose term equals expected + Analyzer analyzer = new BrazilianAnalyzer(); + TokenStream stream = analyzer.tokenStream("dummy", new StringReader(input)); + final Token reusableToken = new Token(); + Token nextToken = stream.next(reusableToken); + if (nextToken == null) + fail(); + assertEquals(expected, nextToken.term()); + assertTrue(stream.next(nextToken) == null); + stream.close(); + } + +} \ No newline at end of file Index: contrib/analyzers/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java =================================================================== --- 
contrib/analyzers/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java (revision 0) +++ contrib/analyzers/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java (revision 0) @@ -0,0 +1,133 @@ +package org.apache.lucene.analysis.nl; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.StringReader; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; + +/** + * Test the Dutch Stem Filter, which only modifies the term text. + * + * The code states that it uses the snowball algorithm, but tests reveal some differences. 
+ * + */ +public class TestDutchStemmer extends TestCase { + + public void testWithSnowballExamples() throws IOException { + check("lichaamsziek", "lichaamsziek"); + check("lichamelijk", "licham"); + check("lichamelijke", "licham"); + check("lichamelijkheden", "licham"); + check("lichamen", "licham"); + check("lichere", "licher"); + check("licht", "licht"); + check("lichtbeeld", "lichtbeeld"); + check("lichtbruin", "lichtbruin"); + check("lichtdoorlatende", "lichtdoorlat"); + check("lichte", "licht"); + check("lichten", "licht"); + check("lichtende", "lichtend"); + check("lichtenvoorde", "lichtenvoord"); + check("lichter", "lichter"); + check("lichtere", "lichter"); + check("lichters", "lichter"); + check("lichtgevoeligheid", "lichtgevoel"); + check("lichtgewicht", "lichtgewicht"); + check("lichtgrijs", "lichtgrijs"); + check("lichthoeveelheid", "lichthoevel"); + check("lichtintensiteit", "lichtintensiteit"); + check("lichtje", "lichtj"); + check("lichtjes", "lichtjes"); + check("lichtkranten", "lichtkrant"); + check("lichtkring", "lichtkring"); + check("lichtkringen", "lichtkring"); + check("lichtregelsystemen", "lichtregelsystem"); + check("lichtste", "lichtst"); + check("lichtstromende", "lichtstrom"); + check("lichtte", "licht"); + check("lichtten", "licht"); + check("lichttoetreding", "lichttoetred"); + check("lichtverontreinigde", "lichtverontreinigd"); + check("lichtzinnige", "lichtzinn"); + check("lid", "lid"); + check("lidia", "lidia"); + check("lidmaatschap", "lidmaatschap"); + check("lidstaten", "lidstat"); + check("lidvereniging", "lidveren"); + check("opgingen", "opging"); + check("opglanzing", "opglanz"); + check("opglanzingen", "opglanz"); + check("opglimlachten", "opglimlacht"); + check("opglimpen", "opglimp"); + check("opglimpende", "opglimp"); + check("opglimping", "opglimp"); + check("opglimpingen", "opglimp"); + check("opgraven", "opgrav"); + check("opgrijnzen", "opgrijnz"); + check("opgrijzende", "opgrijz"); + check("opgroeien", "opgroei"); + 
check("opgroeiende", "opgroei"); + check("opgroeiplaats", "opgroeiplat"); + check("ophaal", "ophal"); + check("ophaaldienst", "ophaaldienst"); + check("ophaalkosten", "ophaalkost"); + check("ophaalsystemen", "ophaalsystem"); + check("ophaalt", "ophaalt"); + check("ophaaltruck", "ophaaltruck"); + check("ophalen", "ophal"); + check("ophalend", "ophal"); + check("ophalers", "ophaler"); + check("ophef", "ophef"); + check("opheffen", "ophef"); // versus snowball 'opheff' + check("opheffende", "ophef"); // versus snowball 'opheff' + check("opheffing", "ophef"); // versus snowball 'opheff' + check("opheldering", "ophelder"); + check("ophemelde", "ophemeld"); + check("ophemelen", "ophemel"); + check("opheusden", "opheusd"); + check("ophief", "ophief"); + check("ophield", "ophield"); + check("ophieven", "ophiev"); + check("ophoepelt", "ophoepelt"); + check("ophoog", "ophog"); + check("ophoogzand", "ophoogzand"); + check("ophopen", "ophop"); + check("ophoping", "ophop"); + check("ophouden", "ophoud"); + } + + + private void check(final String input, final String expected) throws IOException { // analyzes input with DutchAnalyzer and asserts it yields exactly one token whose term equals expected + Analyzer analyzer = new DutchAnalyzer(); + TokenStream stream = analyzer.tokenStream("dummy", new StringReader(input)); + final Token reusableToken = new Token(); + Token nextToken = stream.next(reusableToken); + if (nextToken == null) + fail(); + assertEquals(expected, nextToken.term()); + assertTrue(stream.next(nextToken) == null); + stream.close(); + } + +} \ No newline at end of file Index: contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java =================================================================== --- contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (revision 784768) +++ contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (working copy) @@ -30,8 +30,57 @@ */ public class TestThaiAnalyzer extends TestCase { - - public void assertAnalyzesTo(Analyzer a, String input, String[] output) + + /* 
+ * This is 17 characters of text. + * The end offsets are incorrect, which breaks highlighting, etc. + * + * Fix is easy: ThaiWordFilter.java line 50, change: + * reusableToken.setEndOffset(thaiToken.endOffset()+end); + * to: + * reusableToken.setEndOffset(thaiToken.startOffset()+end); + */ + public void testBuggyOffsets() throws Exception { + assertAnalyzesTo(new ThaiAnalyzer(), "เดอะนิวยอร์กไทมส์", + new String[] { "เด", "อะนิว", "ยอ", "ร์ก", "ไทมส์"}, + new int[] { 0, 2, 7, 9, 12 }, + new int[] { 2, 24, 26, 29, 34}); + } + + /* correct testcase + public void testOffsets() throws Exception { + assertAnalyzesTo(new ThaiAnalyzer(), "เดอะนิวยอร์กไทมส์", + new String[] { "เด", "อะนิว", "ยอ", "ร์ก", "ไทมส์"}, + new int[] { 0, 2, 7, 9, 12 }, + new int[] { 2, 7, 9, 12, 17}); + } + */ + + /* + * Thai numeric tokens are typed as <ALPHANUM> instead of <NUM>. + * This is really a problem with the interaction w/ StandardTokenizer, which is used by ThaiAnalyzer. + * + * The issue is this: in StandardTokenizer the entire [:Thai:] block is specified in ALPHANUM (including punctuation, digits, etc) + * Fix is easy: refine this spec to exclude thai punctuation and digits. + * + * A better fix, that would also fix quite a few other languages would be to remove the thai hack. + * Instead, allow the definition of alphanum to include relevant categories like nonspacing marks! 
+ */ + public void testBuggyTokenType() throws Exception { + assertAnalyzesTo(new ThaiAnalyzer(), "เดอะนิวยอร์กไทมส์ ๑๒๓", + new String[] { "เด", "อะนิว", "ยอ", "ร์ก", "ไทมส์", "๑๒๓" }, + new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" }); + } + + /* correct testcase + public void testTokenType() throws Exception { + assertAnalyzesTo(new ThaiAnalyzer(), "เดอะนิวยอร์กไทมส์ ๑๒๓", + new String[] { "เด", "อะนิว", "ยอ", "ร์ก", "ไทมส์", "๑๒๓" }, + new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<NUM>" }); + } + */ + + public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[]) throws Exception { TokenStream ts = a.tokenStream("dummy", new StringReader(input)); @@ -40,10 +89,28 @@ Token nextToken = ts.next(reusableToken); assertNotNull(nextToken); assertEquals(nextToken.term(), output[i]); + if (startOffsets != null) + assertEquals(startOffsets[i], nextToken.startOffset()); + if (endOffsets != null) + assertEquals(endOffsets[i], nextToken.endOffset()); + if (types != null) + assertEquals(types[i], nextToken.type()); } assertNull(ts.next(reusableToken)); ts.close(); } + + public void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception { + assertAnalyzesTo(a, input, output, null, null, null); + } + + public void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws Exception { + assertAnalyzesTo(a, input, output, null, null, types); + } + + public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws Exception { + assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null); + } public void testAnalyzer() throws Exception { ThaiAnalyzer analyzer = new ThaiAnalyzer();