Index: contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java	(revision 784768)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java	(working copy)
@@ -58,7 +58,7 @@
   private WordSegmenter wordSegment;
 
   public SmartChineseAnalyzer() {
-    this(false);
+    this(true);
   }
 
   /**
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java	(revision 784768)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java	(working copy)
@@ -47,7 +47,7 @@
     if (end != BreakIterator.DONE) {
       reusableToken.reinit(thaiToken, thaiToken.termBuffer(), start, end - start);
       reusableToken.setStartOffset(thaiToken.startOffset()+start);
-      reusableToken.setEndOffset(thaiToken.endOffset()+end);
+      reusableToken.setEndOffset(thaiToken.startOffset()+end);
       return reusableToken;
     }
     thaiToken = null;
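A note on the ThaiWordFilter fix above: the BreakIterator reports subword boundaries relative to the incoming token's term text, so both the start and the end offset must be rebased onto thaiToken.startOffset(); the old code rebased the end onto endOffset() and pushed every subword's end offset past the source token. A minimal standalone sketch of the arithmetic, using made-up offsets:

    // Made-up numbers: a source token covering [10, 27) of the input, and a
    // BreakIterator subword spanning positions [2, 5) of that token's term text.
    public class ThaiOffsetSketch {
      public static void main(String[] args) {
        int tokenStartOffset = 10;  // thaiToken.startOffset()
        int tokenEndOffset = 27;    // thaiToken.endOffset()
        int start = 2, end = 5;     // BreakIterator positions, relative to the term text

        int fixedEnd = tokenStartOffset + end;  // patched code: 15, inside the source token
        int buggyEnd = tokenEndOffset + end;    // old code: 32, past the token's end
        System.out.println("fixed=" + fixedEnd + " buggy=" + buggyEnd);
      }
    }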
Index: contrib/analyzers/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
===================================================================
--- contrib/analyzers/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java	(revision 0)
+++ contrib/analyzers/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java	(revision 0)
@@ -0,0 +1,134 @@
+package org.apache.lucene.analysis.br;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Test the Brazilian Stem Filter, which only modifies the term text.
+ *
+ * It is very similar to the snowball portuguese algorithm, but not exactly the same.
+ */
+public class TestBrazilianStemmer extends TestCase {
+
+  public void testWithSnowballExamples() throws IOException {
+    check("boa", "boa");
+    check("boainain", "boainain");
+    check("boas", "boas");
+    check("bôas", "boas"); // removes diacritic: different from snowball portuguese
+    check("boassu", "boassu");
+    check("boataria", "boat");
+    check("boate", "boat");
+    check("boates", "boat");
+    check("boatos", "boat");
+    check("bob", "bob");
+    check("boba", "bob");
+    check("bobagem", "bobag");
+    check("bobagens", "bobagens");
+    check("bobalhões", "bobalho"); // removes diacritic: different from snowball portuguese
+    check("bobear", "bob");
+    check("bobeira", "bobeir");
+    check("bobinho", "bobinh");
+    check("bobinhos", "bobinh");
+    check("bobo", "bob");
+    check("bobs", "bobs");
+    check("boca", "boc");
+    check("bocadas", "boc");
+    check("bocadinho", "bocadinh");
+    check("bocado", "boc");
+    check("bocaiúva", "bocaiuv"); // removes diacritic: different from snowball portuguese
+    check("boçal", "bocal"); // removes diacritic: different from snowball portuguese
+    check("bocarra", "bocarr");
+    check("bocas", "boc");
+    check("bode", "bod");
+    check("bodoque", "bodoqu");
+    check("body", "body");
+    check("boeing", "boeing");
+    check("boem", "boem");
+    check("boemia", "boem");
+    check("boêmio", "boemi"); // removes diacritic: different from snowball portuguese
+    check("bogotá", "bogot");
+    check("boi", "boi");
+    check("bóia", "boi"); // removes diacritic: different from snowball portuguese
+    check("boiando", "boi");
+    check("quiabo", "quiab");
+    check("quicaram", "quic");
+    check("quickly", "quickly");
+    check("quieto", "quiet");
+    check("quietos", "quiet");
+    check("quilate", "quilat");
+    check("quilates", "quilat");
+    check("quilinhos", "quilinh");
+    check("quilo", "quil");
+    check("quilombo", "quilomb");
+    check("quilométricas", "quilometr"); // removes diacritic: different from snowball portuguese
+    check("quilométricos", "quilometr"); // removes diacritic: different from snowball portuguese
+    check("quilômetro", "quilometr"); // removes diacritic: different from snowball portuguese
+    check("quilômetros", "quilometr"); // removes diacritic: different from snowball portuguese
+    check("quilos", "quil");
+    check("quimica", "quimic");
+    check("quilos", "quil");
+    check("quimica", "quimic");
+    check("quimicas", "quimic");
+    check("quimico", "quimic");
+    check("quimicos", "quimic");
+    check("quimioterapia", "quimioterap");
+    check("quimioterápicos", "quimioterap"); // removes diacritic: different from snowball portuguese
+    check("quimono", "quimon");
+    check("quincas", "quinc");
+    check("quinhão", "quinha"); // removes diacritic: different from snowball portuguese
+    check("quinhentos", "quinhent");
+    check("quinn", "quinn");
+    check("quino", "quin");
+    check("quinta", "quint");
+    check("quintal", "quintal");
+    check("quintana", "quintan");
+    check("quintanilha", "quintanilh");
+    check("quintão", "quinta"); // removes diacritic: different from snowball portuguese
+    check("quintessência", "quintessente"); // versus snowball portuguese 'quintessent'
+    check("quintino", "quintin");
+    check("quinto", "quint");
+    check("quintos", "quint");
+    check("quintuplicou", "quintuplic");
+    check("quinze", "quinz");
+    check("quinzena", "quinzen");
+    check("quiosque", "quiosqu");
+  }
+
+  private void check(final String input, final String expected) throws IOException {
+    Analyzer analyzer = new BrazilianAnalyzer();
+    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(input));
+    final Token reusableToken = new Token();
+    Token nextToken = stream.next(reusableToken);
+    if (nextToken == null)
+      fail();
+    assertEquals(expected, nextToken.term());
+    assertTrue(stream.next(nextToken) == null);
+    stream.close();
+  }
+
+}
\ No newline at end of file
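The javadoc's point that the stem filter only modifies the term text can be seen by driving BrazilianStemFilter directly. A minimal sketch, assuming the StandardTokenizer + LowerCaseFilter chain that BrazilianAnalyzer builds internally (the chain here is an illustration, not copied from the analyzer source):

    import java.io.StringReader;
    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.br.BrazilianStemFilter;
    import org.apache.lucene.analysis.standard.StandardTokenizer;

    public class BrazilianStemSketch {
      public static void main(String[] args) throws Exception {
        TokenStream ts = new BrazilianStemFilter(
            new LowerCaseFilter(new StandardTokenizer(new StringReader("boataria"))));
        Token nextToken = ts.next(new Token());
        // term is stemmed to "boat", but the offsets still span all of "boataria": [0,8)
        System.out.println(nextToken.term() + " [" + nextToken.startOffset() + "," + nextToken.endOffset() + ")");
        ts.close();
      }
    }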
Index: contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java
===================================================================
--- contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java	(revision 784768)
+++ contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java	(working copy)
@@ -31,7 +31,27 @@
 import org.apache.lucene.analysis.TokenStream;
 
 public class TestSmartChineseAnalyzer extends TestCase {
-  
+
+  public void testChineseStopWordsDefault() throws Exception {
+    Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */
+    String sentence = "我购买了道具和服装。";
+    String result[] = { "我", "购买", "了", "道具", "和", "服装" };
+    assertAnalyzesTo(ca, sentence, result);
+  }
+
+  /*
+   * Punctuation is handled in a strange way if you disable stopwords.
+   * In this example the IDEOGRAPHIC FULL STOP is converted into a comma.
+   * If you don't supply true to the constructor, or if you use a different stopword list,
+   * then punctuation is indexed.
+   */
+  public void testChineseStopWordsOff() throws Exception {
+    Analyzer ca = new SmartChineseAnalyzer(false); /* doesn't load stopwords */
+    String sentence = "我购买了道具和服装。";
+    String result[] = { "我", "购买", "了", "道具", "和", "服装", "," };
+    assertAnalyzesTo(ca, sentence, result);
+  }
+
   public void testChineseAnalyzer() throws IOException {
     Token nt = new Token();
     Analyzer ca = new SmartChineseAnalyzer(true);
@@ -47,7 +67,55 @@
     }
     ts.close();
   }
+
+  /*
+   * English words are lowercased and porter-stemmed.
+   */
+  public void testMixedLatinChinese() throws Exception {
+    assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买 Tests 了道具和服装",
+        new String[] { "我", "购买", "test", "了", "道具", "和", "服装" });
+  }
+
+  public void testOffsets() throws Exception {
+    assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买了道具和服装",
+        new String[] { "我", "购买", "了", "道具", "和", "服装" },
+        new int[] { 0, 1, 3, 4, 6, 7 },
+        new int[] { 1, 3, 4, 6, 7, 9 });
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[])
+      throws Exception {
+    TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+    final Token reusableToken = new Token();
+    for (int i = 0; i < output.length; i++) {
+      Token nextToken = ts.next(reusableToken);
+      assertNotNull(nextToken);
+      assertEquals(nextToken.term(), output[i]);
+      if (startOffsets != null)
+        assertEquals(nextToken.startOffset(), startOffsets[i]);
+      if (endOffsets != null)
+        assertEquals(nextToken.endOffset(), endOffsets[i]);
+      if (types != null)
+        assertEquals(nextToken.type(), types[i]);
+    }
+    assertNull(ts.next(reusableToken));
+    ts.close();
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
+    assertAnalyzesTo(a, input, output, null, null, null);
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws Exception {
+    assertAnalyzesTo(a, input, output, null, null, types);
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws Exception {
+    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null);
+  }
+
   /**
    * @param args
    * @throws IOException
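These stopword tests pin down the constructor change from the first hunk of this patch: the no-arg constructor now delegates to this(true), so it loads the bundled Chinese stopword list and drops the trailing IDEOGRAPHIC FULL STOP exactly like SmartChineseAnalyzer(true). A minimal sketch of that equivalence, assuming the 2.9-era TokenStream.next(Token) API used throughout these tests:

    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cn.SmartChineseAnalyzer;

    public class SmartChineseDefaultSketch {
      public static void main(String[] args) throws Exception {
        // After the patch, both analyzers load the bundled stopword list, so
        // both lines should print: 我 购买 了 道具 和 服装 (no punctuation token)
        Analyzer[] analyzers = { new SmartChineseAnalyzer(), new SmartChineseAnalyzer(true) };
        for (int i = 0; i < analyzers.length; i++) {
          TokenStream ts = analyzers[i].tokenStream("dummy", new StringReader("我购买了道具和服装。"));
          final Token reusableToken = new Token();
          for (Token t = ts.next(reusableToken); t != null; t = ts.next(reusableToken))
            System.out.print(t.term() + " ");
          System.out.println();
          ts.close();
        }
      }
    }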
Index: contrib/analyzers/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
===================================================================
--- contrib/analyzers/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java	(revision 0)
+++ contrib/analyzers/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java	(revision 0)
@@ -0,0 +1,51 @@
+package org.apache.lucene.analysis.cz;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Test the CzechAnalyzer.
+ *
+ * CzechAnalyzer is like a StandardAnalyzer with a custom stopword list.
+ */
+public class TestCzechAnalyzer extends TestCase {
+
+  public void testStopWord() throws Exception {
+    assertAnalyzesTo(new CzechAnalyzer(), "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
+  }
+
+  private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
+    TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+    final Token reusableToken = new Token();
+    for (int i = 0; i < output.length; i++) {
+      Token nextToken = ts.next(reusableToken);
+      assertNotNull(nextToken);
+      assertEquals(nextToken.term(), output[i]);
+    }
+    assertNull(ts.next(reusableToken));
+    ts.close();
+  }
+}
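The javadoc's claim that CzechAnalyzer is essentially StandardAnalyzer plus a Czech stopword list can be checked by running both analyzers over the test sentence. A small sketch; the printed output noted in the comment is an expectation inferred from testStopWord above, not taken from the patch:

    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cz.CzechAnalyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;

    public class CzechVsStandardSketch {
      public static void main(String[] args) throws Exception {
        // Expected: StandardAnalyzer keeps all four lowercased terms
        // (pokud mluvime o volnem), CzechAnalyzer drops the Czech stopwords
        // "pokud" and "o" (mluvime volnem).
        Analyzer[] analyzers = { new StandardAnalyzer(), new CzechAnalyzer() };
        for (int i = 0; i < analyzers.length; i++) {
          TokenStream ts = analyzers[i].tokenStream("dummy", new StringReader("Pokud mluvime o volnem"));
          final Token reusableToken = new Token();
          for (Token t = ts.next(reusableToken); t != null; t = ts.next(reusableToken))
            System.out.print(t.term() + " ");
          System.out.println();
          ts.close();
        }
      }
    }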
Index: contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
===================================================================
--- contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java	(revision 784768)
+++ contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java	(working copy)
@@ -23,6 +23,41 @@
 import org.apache.lucene.analysis.TokenStream;
 
 public class TestThaiAnalyzer extends TestCase {
+
+  /*
+   * testcase for offsets
+   */
+  public void testOffsets() throws Exception {
+    assertAnalyzesTo(new ThaiAnalyzer(), "เดอะนิวยอร์กไทมส์",
+        new String[] { "เด", "อะนิว", "ยอ", "ร์ก", "ไทมส์" },
+        new int[] { 0, 2, 7, 9, 12 },
+        new int[] { 2, 7, 9, 12, 17 });
+  }
+
+  /*
+   * Thai numeric tokens are typed as <ALPHANUM> instead of <NUM>.
+   * This is really a problem with the interaction w/ StandardTokenizer, which is used by ThaiAnalyzer.
+   *
+   * The issue is this: in StandardTokenizer the entire [:Thai:] block is specified in ALPHANUM (including punctuation, digits, etc.).
+   * Fix is easy: refine this spec to exclude Thai punctuation and digits.
+   *
+   * A better fix, one that would also fix quite a few other languages, would be to remove the Thai hack.
+   * Instead, allow the definition of ALPHANUM to include relevant categories like nonspacing marks!
+   */
+  public void testBuggyTokenType() throws Exception {
+    assertAnalyzesTo(new ThaiAnalyzer(), "เดอะนิวยอร์กไทมส์ ๑๒๓",
+        new String[] { "เด", "อะนิว", "ยอ", "ร์ก", "ไทมส์", "๑๒๓" },
+        new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
+  }
+
+  /* correct testcase
+  public void testTokenType() throws Exception {
+    assertAnalyzesTo(new ThaiAnalyzer(), "เดอะนิวยอร์กไทมส์ ๑๒๓",
+        new String[] { "เด", "อะนิว", "ยอ", "ร์ก", "ไทมส์", "๑๒๓" },
+        new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<NUM>" });
+  }
+  */
+
   public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[])
       throws Exception {
     TokenStream ts = a.tokenStream("dummy", new StringReader(input));
@@ -40,10 +75,28 @@
       Token nextToken = ts.next(reusableToken);
       assertNotNull(nextToken);
       assertEquals(nextToken.term(), output[i]);
+      if (startOffsets != null)
+        assertEquals(nextToken.startOffset(), startOffsets[i]);
+      if (endOffsets != null)
+        assertEquals(nextToken.endOffset(), endOffsets[i]);
+      if (types != null)
+        assertEquals(nextToken.type(), types[i]);
     }
     assertNull(ts.next(reusableToken));
     ts.close();
   }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
+    assertAnalyzesTo(a, input, output, null, null, null);
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws Exception {
+    assertAnalyzesTo(a, input, output, null, null, types);
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws Exception {
+    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null);
+  }
 
   public void testAnalyzer() throws Exception {
     ThaiAnalyzer analyzer = new ThaiAnalyzer();