Index: solr/src/test/org/apache/solr/analysis/TestLatvianStemFilterFactory.java =================================================================== --- solr/src/test/org/apache/solr/analysis/TestLatvianStemFilterFactory.java (revision 0) +++ solr/src/test/org/apache/solr/analysis/TestLatvianStemFilterFactory.java (revision 0) @@ -0,0 +1,36 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** + * Simple tests to ensure the Latvian stem factory is working. 
+ */ +public class TestLatvianStemFilterFactory extends BaseTokenTestCase { + public void testStemming() throws Exception { + Reader reader = new StringReader("tirgiem tirgus"); + LatvianStemFilterFactory factory = new LatvianStemFilterFactory(); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, new String[] { "tirg", "tirg" }); + } +} Property changes on: solr\src\test\org\apache\solr\analysis\TestLatvianStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style + native Index: solr/src/java/org/apache/solr/analysis/LatvianStemFilterFactory.java =================================================================== --- solr/src/java/org/apache/solr/analysis/LatvianStemFilterFactory.java (revision 0) +++ solr/src/java/org/apache/solr/analysis/LatvianStemFilterFactory.java (revision 0) @@ -0,0 +1,38 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.lv.LatvianStemFilter; + +/** + * Factory for {@link LatvianStemFilter}. + *
+ * <pre>
+ * &lt;fieldType name="text_lvstem" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ *     &lt;filter class="solr.LatvianStemFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */ +public class LatvianStemFilterFactory extends BaseTokenFilterFactory { + public TokenStream create(TokenStream input) { + return new LatvianStemFilter(input); + } +} Property changes on: solr\src\java\org\apache\solr\analysis\LatvianStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style + native Index: modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java (revision 0) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java (revision 0) @@ -0,0 +1,53 @@ +package org.apache.lucene.analysis.lv; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; + +public class TestLatvianAnalyzer extends BaseTokenStreamTestCase { + /** This test fails with NPE when the + * stopwords file is missing in classpath */ + public void testResourcesAvailable() { + new LatvianAnalyzer(TEST_VERSION_CURRENT); + } + + /** test stopwords and stemming */ + public void testBasics() throws IOException { + Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT); + // stemming + checkOneTermReuse(a, "tirgiem", "tirg"); + checkOneTermReuse(a, "tirgus", "tirg"); + // stopword + assertAnalyzesTo(a, "un", new String[] {}); + } + + /** test use of exclusion set */ + public void testExclude() throws IOException { + Set exclusionSet = new HashSet(); + exclusionSet.add("tirgiem"); + Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT, + LatvianAnalyzer.getDefaultStopSet(), exclusionSet); + checkOneTermReuse(a, "tirgiem", "tirgiem"); + checkOneTermReuse(a, "tirgus", "tirg"); + } +} Property changes on: modules\analysis\common\src\test\org\apache\lucene\analysis\lv\TestLatvianAnalyzer.java ___________________________________________________________________ Added: svn:eol-style + native Index: modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemmer.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemmer.java (revision 0) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemmer.java (revision 0) @@ -0,0 +1,272 @@ +package org.apache.lucene.analysis.lv; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; + +/** + * Basic tests for {@link LatvianStemmer} + */ +public class TestLatvianStemmer extends BaseTokenStreamTestCase { + private Analyzer a = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); + return new TokenStreamComponents(tokenizer, new LatvianStemFilter(tokenizer)); + } + }; + + public void testNouns1() throws IOException { + // decl. I + checkOneTerm(a, "tēvs", "tēv"); // nom. sing. + checkOneTerm(a, "tēvi", "tēv"); // nom. pl. + checkOneTerm(a, "tēva", "tēv"); // gen. sing. + checkOneTerm(a, "tēvu", "tēv"); // gen. pl. + checkOneTerm(a, "tēvam", "tēv"); // dat. sing. + checkOneTerm(a, "tēviem", "tēv"); // dat. pl. + checkOneTerm(a, "tēvu", "tēv"); // acc. sing. + checkOneTerm(a, "tēvus", "tēv"); // acc. pl. + checkOneTerm(a, "tēvā", "tēv"); // loc. sing. + checkOneTerm(a, "tēvos", "tēv"); // loc. pl. + checkOneTerm(a, "tēvs", "tēv"); // voc. sing. + checkOneTerm(a, "tēvi", "tēv"); // voc. pl. 
+ } + + /** + * decl II nouns with (s,t) -> š and (d,z) -> ž + * palatalization will generally conflate to two stems + * due to the ambiguity (plural and singular). + */ + public void testNouns2() throws IOException { + // decl. II + + // c -> č palatalization + checkOneTerm(a, "lācis", "lāc"); // nom. sing. + checkOneTerm(a, "lāči", "lāc"); // nom. pl. + checkOneTerm(a, "lāča", "lāc"); // gen. sing. + checkOneTerm(a, "lāču", "lāc"); // gen. pl. + checkOneTerm(a, "lācim", "lāc"); // dat. sing. + checkOneTerm(a, "lāčiem", "lāc"); // dat. pl. + checkOneTerm(a, "lāci", "lāc"); // acc. sing. + checkOneTerm(a, "lāčus", "lāc"); // acc. pl. + checkOneTerm(a, "lācī", "lāc"); // loc. sing. + checkOneTerm(a, "lāčos", "lāc"); // loc. pl. + checkOneTerm(a, "lāci", "lāc"); // voc. sing. + checkOneTerm(a, "lāči", "lāc"); // voc. pl. + + // n -> ņ palatalization + checkOneTerm(a, "akmens", "akmen"); // nom. sing. + checkOneTerm(a, "akmeņi", "akmen"); // nom. pl. + checkOneTerm(a, "akmens", "akmen"); // gen. sing. + checkOneTerm(a, "akmeņu", "akmen"); // gen. pl. + checkOneTerm(a, "akmenim", "akmen"); // dat. sing. + checkOneTerm(a, "akmeņiem", "akmen"); // dat. pl. + checkOneTerm(a, "akmeni", "akmen"); // acc. sing. + checkOneTerm(a, "akmeņus", "akmen"); // acc. pl. + checkOneTerm(a, "akmenī", "akmen"); // loc. sing. + checkOneTerm(a, "akmeņos", "akmen"); // loc. pl. + checkOneTerm(a, "akmens", "akmen"); // voc. sing. + checkOneTerm(a, "akmeņi", "akmen"); // voc. pl. + + // no palatalization + checkOneTerm(a, "kurmis", "kurm"); // nom. sing. + checkOneTerm(a, "kurmji", "kurm"); // nom. pl. + checkOneTerm(a, "kurmja", "kurm"); // gen. sing. + checkOneTerm(a, "kurmju", "kurm"); // gen. pl. + checkOneTerm(a, "kurmim", "kurm"); // dat. sing. + checkOneTerm(a, "kurmjiem", "kurm"); // dat. pl. + checkOneTerm(a, "kurmi", "kurm"); // acc. sing. + checkOneTerm(a, "kurmjus", "kurm"); // acc. pl. + checkOneTerm(a, "kurmī", "kurm"); // loc. sing. + checkOneTerm(a, "kurmjos", "kurm"); // loc. 
pl. + checkOneTerm(a, "kurmi", "kurm"); // voc. sing. + checkOneTerm(a, "kurmji", "kurm"); // voc. pl. + } + + public void testNouns3() throws IOException { + // decl III + checkOneTerm(a, "lietus", "liet"); // nom. sing. + checkOneTerm(a, "lieti", "liet"); // nom. pl. + checkOneTerm(a, "lietus", "liet"); // gen. sing. + checkOneTerm(a, "lietu", "liet"); // gen. pl. + checkOneTerm(a, "lietum", "liet"); // dat. sing. + checkOneTerm(a, "lietiem", "liet"); // dat. pl. + checkOneTerm(a, "lietu", "liet"); // acc. sing. + checkOneTerm(a, "lietus", "liet"); // acc. pl. + checkOneTerm(a, "lietū", "liet"); // loc. sing. + checkOneTerm(a, "lietos", "liet"); // loc. pl. + checkOneTerm(a, "lietus", "liet"); // voc. sing. + checkOneTerm(a, "lieti", "liet"); // voc. pl. + } + + public void testNouns4() throws IOException { + // decl IV + checkOneTerm(a, "lapa", "lap"); // nom. sing. + checkOneTerm(a, "lapas", "lap"); // nom. pl. + checkOneTerm(a, "lapas", "lap"); // gen. sing. + checkOneTerm(a, "lapu", "lap"); // gen. pl. + checkOneTerm(a, "lapai", "lap"); // dat. sing. + checkOneTerm(a, "lapām", "lap"); // dat. pl. + checkOneTerm(a, "lapu", "lap"); // acc. sing. + checkOneTerm(a, "lapas", "lap"); // acc. pl. + checkOneTerm(a, "lapā", "lap"); // loc. sing. + checkOneTerm(a, "lapās", "lap"); // loc. pl. + checkOneTerm(a, "lapa", "lap"); // voc. sing. + checkOneTerm(a, "lapas", "lap"); // voc. pl. + + checkOneTerm(a, "puika", "puik"); // nom. sing. + checkOneTerm(a, "puikas", "puik"); // nom. pl. + checkOneTerm(a, "puikas", "puik"); // gen. sing. + checkOneTerm(a, "puiku", "puik"); // gen. pl. + checkOneTerm(a, "puikam", "puik"); // dat. sing. + checkOneTerm(a, "puikām", "puik"); // dat. pl. + checkOneTerm(a, "puiku", "puik"); // acc. sing. + checkOneTerm(a, "puikas", "puik"); // acc. pl. + checkOneTerm(a, "puikā", "puik"); // loc. sing. + checkOneTerm(a, "puikās", "puik"); // loc. pl. + checkOneTerm(a, "puika", "puik"); // voc. sing. + checkOneTerm(a, "puikas", "puik"); // voc. 
pl. + } + + /** + * Genitive plural forms with (s,t) -> š and (d,z) -> ž + * will not conflate due to ambiguity. + */ + public void testNouns5() throws IOException { + // decl V + // l -> ļ palatalization + checkOneTerm(a, "egle", "egl"); // nom. sing. + checkOneTerm(a, "egles", "egl"); // nom. pl. + checkOneTerm(a, "egles", "egl"); // gen. sing. + checkOneTerm(a, "egļu", "egl"); // gen. pl. + checkOneTerm(a, "eglei", "egl"); // dat. sing. + checkOneTerm(a, "eglēm", "egl"); // dat. pl. + checkOneTerm(a, "egli", "egl"); // acc. sing. + checkOneTerm(a, "egles", "egl"); // acc. pl. + checkOneTerm(a, "eglē", "egl"); // loc. sing. + checkOneTerm(a, "eglēs", "egl"); // loc. pl. + checkOneTerm(a, "egle", "egl"); // voc. sing. + checkOneTerm(a, "egles", "egl"); // voc. pl. + } + + public void testNouns6() throws IOException { + // decl VI + + // no palatalization + checkOneTerm(a, "govs", "gov"); // nom. sing. + checkOneTerm(a, "govis", "gov"); // nom. pl. + checkOneTerm(a, "govs", "gov"); // gen. sing. + checkOneTerm(a, "govju", "gov"); // gen. pl. + checkOneTerm(a, "govij", "gov"); // dat. sing. + checkOneTerm(a, "govīm", "gov"); // dat. pl. + checkOneTerm(a, "govi ", "gov"); // acc. sing. + checkOneTerm(a, "govis", "gov"); // acc. pl. + checkOneTerm(a, "govi ", "gov"); // inst. sing. + checkOneTerm(a, "govīm", "gov"); // inst. pl. + checkOneTerm(a, "govī", "gov"); // loc. sing. + checkOneTerm(a, "govīs", "gov"); // loc. pl. + checkOneTerm(a, "govs", "gov"); // voc. sing. + checkOneTerm(a, "govis", "gov"); // voc. pl. + } + + public void testAdjectives() throws IOException { + checkOneTerm(a, "zils", "zil"); // indef. nom. masc. sing. + checkOneTerm(a, "zilais", "zil"); // def. nom. masc. sing. + checkOneTerm(a, "zili", "zil"); // indef. nom. masc. pl. + checkOneTerm(a, "zilie", "zil"); // def. nom. masc. pl. + checkOneTerm(a, "zila", "zil"); // indef. nom. fem. sing. + checkOneTerm(a, "zilā", "zil"); // def. nom. fem. sing. + checkOneTerm(a, "zilas", "zil"); // indef. 
nom. fem. pl. + checkOneTerm(a, "zilās", "zil"); // def. nom. fem. pl. + checkOneTerm(a, "zila", "zil"); // indef. gen. masc. sing. + checkOneTerm(a, "zilā", "zil"); // def. gen. masc. sing. + checkOneTerm(a, "zilu", "zil"); // indef. gen. masc. pl. + checkOneTerm(a, "zilo", "zil"); // def. gen. masc. pl. + checkOneTerm(a, "zilas", "zil"); // indef. gen. fem. sing. + checkOneTerm(a, "zilās", "zil"); // def. gen. fem. sing. + checkOneTerm(a, "zilu", "zil"); // indef. gen. fem. pl. + checkOneTerm(a, "zilo", "zil"); // indef. gen. fem. pl. + checkOneTerm(a, "zilam", "zil"); // indef. dat. masc. sing. + checkOneTerm(a, "zilajam", "zil"); // def. dat. masc. sing. + checkOneTerm(a, "ziliem", "zil"); // indef. dat. masc. pl. + checkOneTerm(a, "zilajiem", "zil"); // def. dat. masc. pl. + checkOneTerm(a, "zilai", "zil"); // indef. dat. fem. sing. + checkOneTerm(a, "zilajai", "zil"); // def. dat. fem. sing. + checkOneTerm(a, "zilām", "zil"); // indef. dat. fem. pl. + checkOneTerm(a, "zilajām", "zil"); // indef. dat. fem. pl. + checkOneTerm(a, "zilu", "zil"); // indef. acc. masc. sing. + checkOneTerm(a, "zilo", "zil"); // def. acc. masc. sing. + checkOneTerm(a, "zilus", "zil"); // indef. acc. masc. pl. + checkOneTerm(a, "zilos", "zil"); // def. acc. masc. pl. + checkOneTerm(a, "zilu", "zil"); // indef. acc. fem. sing. + checkOneTerm(a, "zilo", "zil"); // def. acc. fem. sing. + checkOneTerm(a, "zilās", "zil"); // indef. acc. fem. pl. + checkOneTerm(a, "zilās", "zil"); // def. acc. fem. pl. + checkOneTerm(a, "zilā", "zil"); // indef. loc. masc. sing. + checkOneTerm(a, "zilajā", "zil"); // def. loc. masc. sing. + checkOneTerm(a, "zilos", "zil"); // indef. loc. masc. pl. + checkOneTerm(a, "zilajos", "zil"); // def. loc. masc. pl. + checkOneTerm(a, "zilā", "zil"); // indef. loc. fem. sing. + checkOneTerm(a, "zilajā", "zil"); // def. loc. fem. sing. + checkOneTerm(a, "zilās", "zil"); // indef. loc. fem. pl. + checkOneTerm(a, "zilajās", "zil"); // def. loc. fem. pl. 
+ checkOneTerm(a, "zilais", "zil"); // voc. masc. sing. + checkOneTerm(a, "zilie", "zil"); // voc. masc. pl. + checkOneTerm(a, "zilā", "zil"); // voc. fem. sing. + checkOneTerm(a, "zilās", "zil"); // voc. fem. pl. + } + + /** + * Note: we intentionally don't handle the ambiguous + * (s,t) -> š and (d,z) -> ž + */ + public void testPalatalization() throws IOException { + checkOneTerm(a, "krāsns", "krāsn"); // nom. sing. + checkOneTerm(a, "krāšņu", "krāsn"); // gen. pl. + checkOneTerm(a, "zvaigzne", "zvaigzn"); // nom. sing. + checkOneTerm(a, "zvaigžņu", "zvaigzn"); // gen. pl. + checkOneTerm(a, "kāpslis", "kāpsl"); // nom. sing. + checkOneTerm(a, "kāpšļu", "kāpsl"); // gen. pl. + checkOneTerm(a, "zizlis", "zizl"); // nom. sing. + checkOneTerm(a, "zižļu", "zizl"); // gen. pl. + checkOneTerm(a, "vilnis", "viln"); // nom. sing. + checkOneTerm(a, "viļņu", "viln"); // gen. pl. + checkOneTerm(a, "lelle", "lell"); // nom. sing. + checkOneTerm(a, "leļļu", "lell"); // gen. pl. + checkOneTerm(a, "pinne", "pinn"); // nom. sing. + checkOneTerm(a, "piņņu", "pinn"); // gen. pl. + checkOneTerm(a, "rīkste", "rīkst"); // nom. sing. + checkOneTerm(a, "rīkšu", "rīkst"); // gen. pl. + } + + /** + * Test some length restrictions, we require a 3+ char stem, + * with at least one vowel. 
+ */ + public void testLength() throws IOException { + checkOneTerm(a, "usa", "usa"); // length + checkOneTerm(a, "60ms", "60ms"); // vowel count + } +} Property changes on: modules\analysis\common\src\test\org\apache\lucene\analysis\lv\TestLatvianStemmer.java ___________________________________________________________________ Added: svn:eol-style + native Index: modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java (revision 0) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java (revision 0) @@ -0,0 +1,129 @@ +package org.apache.lucene.analysis.lv; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.Reader; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.analysis.util.StopwordAnalyzerBase; +import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.Version; + +/** + * {@link Analyzer} for Latvian. + */ +public final class LatvianAnalyzer extends StopwordAnalyzerBase { + private final Set stemExclusionSet; + + /** File containing default Latvian stopwords. */ + public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; + + /** + * Returns an unmodifiable instance of the default stop words set. + * @return default stop words set. + */ + public static Set getDefaultStopSet(){ + return DefaultSetHolder.DEFAULT_STOP_SET; + } + + /** + * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class + * accesses the static final set the first time.; + */ + private static class DefaultSetHolder { + static final Set DEFAULT_STOP_SET; + + static { + try { + DEFAULT_STOP_SET = WordlistLoader.getWordSet(LatvianAnalyzer.class, + DEFAULT_STOPWORD_FILE); + } catch (IOException ex) { + // default set should always be present as it is part of the + // distribution (JAR) + throw new RuntimeException("Unable to load default stopword set"); + } + } + } + + /** + * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. 
+ */ + public LatvianAnalyzer(Version matchVersion) { + this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); + } + + /** + * Builds an analyzer with the given stop words. + * + * @param matchVersion lucene compatibility version + * @param stopwords a stopword set + */ + public LatvianAnalyzer(Version matchVersion, Set stopwords) { + this(matchVersion, stopwords, CharArraySet.EMPTY_SET); + } + + /** + * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is + * provided this analyzer will add a {@link KeywordMarkerFilter} before + * stemming. + * + * @param matchVersion lucene compatibility version + * @param stopwords a stopword set + * @param stemExclusionSet a set of terms not to be stemmed + */ + public LatvianAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet) { + super(matchVersion, stopwords); + this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( + matchVersion, stemExclusionSet)); + } + + /** + * Creates a + * {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents} + * which tokenizes all the text in the provided {@link Reader}. + * + * @return A + * {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents} + * built from an {@link StandardTokenizer} filtered with + * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter} + * , {@link KeywordMarkerFilter} if a stem exclusion set is + * provided and {@link LatvianStemFilter}. 
+ */ + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + final Tokenizer source = new StandardTokenizer(matchVersion, reader); + TokenStream result = new StandardFilter(matchVersion, source); + result = new LowerCaseFilter(matchVersion, result); + result = new StopFilter(matchVersion, result, stopwords); + if(!stemExclusionSet.isEmpty()) + result = new KeywordMarkerFilter(result, stemExclusionSet); + result = new LatvianStemFilter(result); + return new TokenStreamComponents(source, result); + } +} Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\lv\LatvianAnalyzer.java ___________________________________________________________________ Added: svn:eol-style + native Index: modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemmer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemmer.java (revision 0) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemmer.java (revision 0) @@ -0,0 +1,175 @@ +package org.apache.lucene.analysis.lv; + +import static org.apache.lucene.analysis.util.StemmerUtil.*; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Light stemmer for Latvian. + *

+ * <p>
+ * This is a light version of the algorithm in Karlis Kreslins' PhD thesis
+ * <i>A stemming algorithm for Latvian</i> with the following modifications:
+ *

+ * + */ +public class LatvianStemmer { + /** + * Stem a latvian word. returns the new adjusted length. + */ + public int stem(char s[], int len) { + int numVowels = numVowels(s, len); + + for (int i = 0; i < affixes.length; i++) { + Affix affix = affixes[i]; + if (numVowels > affix.vc && len >= affix.affix.length + 3 && endsWith(s, len, affix.affix)) { + len -= affix.affix.length; + return affix.palatalizes ? unpalatalize(s, len) : len; + } + } + + return len; + } + + static final Affix affixes[] = { + new Affix("ajiem", 3, false), new Affix("ajai", 3, false), + new Affix("ajam", 2, false), new Affix("ajām", 2, false), + new Affix("ajos", 2, false), new Affix("ajās", 2, false), + new Affix("iem", 2, true), new Affix("ajā", 2, false), + new Affix("ais", 2, false), new Affix("ai", 2, false), + new Affix("ei", 2, false), new Affix("ām", 1, false), + new Affix("am", 1, false), new Affix("ēm", 1, false), + new Affix("īm", 1, false), new Affix("im", 1, false), + new Affix("um", 1, false), new Affix("us", 1, true), + new Affix("as", 1, false), new Affix("ās", 1, false), + new Affix("es", 1, false), new Affix("os", 1, true), + new Affix("ij", 1, false), new Affix("īs", 1, false), + new Affix("ēs", 1, false), new Affix("is", 1, false), + new Affix("ie", 1, false), new Affix("u", 1, true), + new Affix("a", 1, true), new Affix("i", 1, true), + new Affix("e", 1, false), new Affix("ā", 1, false), + new Affix("ē", 1, false), new Affix("ī", 1, false), + new Affix("ū", 1, false), new Affix("o", 1, false), + new Affix("s", 0, false), new Affix("š", 0, false), + }; + + static class Affix { + char affix[]; // suffix + int vc; // vowel count of the suffix + boolean palatalizes; // true if we should fire palatalization rules. 
+ + Affix(String affix, int vc, boolean palatalizes) { + this.affix = affix.toCharArray(); + this.vc = vc; + this.palatalizes = palatalizes; + } + } + + /** + * Most cases are handled except for the ambiguous ones: + * + */ + private int unpalatalize(char s[], int len) { + // we check the character removed: if its -u then + // its 2,5, or 6 gen pl., and these two can only apply then. + if (s[len] == 'u') { + // kš -> kst + if (endsWith(s, len, "kš")) { + len++; + s[len-2] = 's'; + s[len-1] = 't'; + return len; + } + // ņņ -> nn + if (endsWith(s, len, "ņņ")) { + s[len-2] = 'n'; + s[len-1] = 'n'; + return len; + } + } + + // otherwise all other rules + if (endsWith(s, len, "pj") || endsWith(s, len, "bj") + || endsWith(s, len, "mj") || endsWith(s, len, "vj")) { + // labial consonant + return len-1; + } else if (endsWith(s, len, "šņ")) { + s[len-2] = 's'; + s[len-1] = 'n'; + return len; + } else if (endsWith(s, len, "žņ")) { + s[len-2] = 'z'; + s[len-1] = 'n'; + return len; + } else if (endsWith(s, len, "šļ")) { + s[len-2] = 's'; + s[len-1] = 'l'; + return len; + } else if (endsWith(s, len, "žļ")) { + s[len-2] = 'z'; + s[len-1] = 'l'; + return len; + } else if (endsWith(s, len, "ļņ")) { + s[len-2] = 'l'; + s[len-1] = 'n'; + return len; + } else if (endsWith(s, len, "ļļ")) { + s[len-2] = 'l'; + s[len-1] = 'l'; + return len; + } else if (s[len-1] == 'č') { + s[len-1] = 'c'; + return len; + } else if (s[len-1] == 'ļ') { + s[len-1] = 'l'; + return len; + } else if (s[len-1] == 'ņ') { + s[len-1] = 'n'; + return len; + } + + return len; + } + + /** + * Count the vowels in the string, we always require at least + * one in the remaining stem to accept it. 
+ */ + private int numVowels(char s[], int len) { + int n = 0; + for (int i = 0; i < len; i++) { + switch(s[i]) { + case 'a': case 'e': case 'i': + case 'o': case 'u': case 'ā': + case 'ī': case 'ē': case 'ū': + n++; + } + } + return n; + } +} Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\lv\LatvianStemmer.java ___________________________________________________________________ Added: svn:eol-style + native Index: modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilter.java (revision 0) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilter.java (revision 0) @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.lv; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link LatvianStemmer} to stem Latvian + * words. + *

+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ */ +public final class LatvianStemFilter extends TokenFilter { + private final LatvianStemmer stemmer = new LatvianStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public LatvianStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\lv\LatvianStemFilter.java ___________________________________________________________________ Added: svn:eol-style + native Index: modules/analysis/common/src/java/org/apache/lucene/analysis/lv/package.html =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/lv/package.html (revision 0) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/lv/package.html (revision 0) @@ -0,0 +1,22 @@ + + + + +Analyzer for Latvian. 
+ + Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\lv\package.html ___________________________________________________________________ Added: svn:eol-style + native Index: modules/analysis/common/src/resources/org/apache/lucene/analysis/lv/stopwords.txt =================================================================== --- modules/analysis/common/src/resources/org/apache/lucene/analysis/lv/stopwords.txt (revision 0) +++ modules/analysis/common/src/resources/org/apache/lucene/analysis/lv/stopwords.txt (revision 0) @@ -0,0 +1,172 @@ +# Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins +# the original list of over 800 forms was refined: +# pronouns, adverbs, interjections were removed +# +# prepositions +aiz +ap +ar +apakš +ārpus +augšpus +bez +caur +dēļ +gar +iekš +iz +kopš +labad +lejpus +līdz +no +otrpus +pa +par +pār +pēc +pie +pirms +pret +priekš +starp +šaipus +uz +viņpus +virs +virspus +zem +apakšpus +# Conjunctions +un +bet +jo +ja +ka +lai +tomēr +tikko +turpretī +arī +kaut +gan +tādēļ +tā +ne +tikvien +vien +kā +ir +te +vai +kamēr +# Particles +ar +diezin +droši +diemžēl +nebūt +ik +it +taču +nu +pat +tiklab +iekšpus +nedz +tik +nevis +turpretim +jeb +iekam +iekām +iekāms +kolīdz +līdzko +tiklīdz +jebšu +tālab +tāpēc +nekā +itin +jā +jau +jel +nē +nezin +tad +tikai +vis +tak +iekams +vien +# modal verbs +būt +biju +biji +bija +bijām +bijāt +esmu +esi +esam +esat +būšu +būsi +būs +būsim +būsiet +tikt +tiku +tiki +tika +tikām +tikāt +tieku +tiec +tiek +tiekam +tiekat +tikšu +tiks +tiksim +tiksiet +tapt +tapi +tapāt +topat +tapšu +tapsi +taps +tapsim +tapsiet +kļūt +kļuvu +kļuvi +kļuva +kļuvām +kļuvāt +kļūstu +kļūsti +kļūst +kļūstam +kļūstat +kļūšu +kļūsi +kļūs +kļūsim +kļūsiet +# verbs +varēt +varēju +varējām +varēšu +varēsim +var +varēji +varējāt +varēsi +varēsiet +varat +varēja +varēs Property changes on: modules\analysis\common\src\resources\org\apache\lucene\analysis\lv\stopwords.txt 
___________________________________________________________________ Added: svn:eol-style + native