Index: solr/src/test/org/apache/solr/analysis/TestLatvianStemFilterFactory.java
===================================================================
--- solr/src/test/org/apache/solr/analysis/TestLatvianStemFilterFactory.java (revision 0)
+++ solr/src/test/org/apache/solr/analysis/TestLatvianStemFilterFactory.java (revision 0)
@@ -0,0 +1,36 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+
+/**
+ * Sanity check that {@link LatvianStemFilterFactory} yields a working stem filter.
+ */
+public class TestLatvianStemFilterFactory extends BaseTokenTestCase {
+ public void testStemming() throws Exception {
+ // both whitespace-separated inputs are expected to come out as "tirg"
+ Reader input = new StringReader("tirgiem tirgus");
+ TokenStream stemmed = new LatvianStemFilterFactory().create(new WhitespaceTokenizer(DEFAULT_VERSION, input));
+ assertTokenStreamContents(stemmed, new String[] { "tirg", "tirg" });
+ }
+}
Property changes on: solr\src\test\org\apache\solr\analysis\TestLatvianStemFilterFactory.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: solr/src/java/org/apache/solr/analysis/LatvianStemFilterFactory.java
===================================================================
--- solr/src/java/org/apache/solr/analysis/LatvianStemFilterFactory.java (revision 0)
+++ solr/src/java/org/apache/solr/analysis/LatvianStemFilterFactory.java (revision 0)
@@ -0,0 +1,38 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.lv.LatvianStemFilter;
+
+/**
+ * Factory for {@link LatvianStemFilter}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_lvstem" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ *     &lt;filter class="solr.LatvianStemFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */
+public class LatvianStemFilterFactory extends BaseTokenFilterFactory {
+ public TokenStream create(TokenStream input) {
+ return new LatvianStemFilter(input);
+ }
+}
Property changes on: solr\src\java\org\apache\solr\analysis\LatvianStemFilterFactory.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java (revision 0)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java (revision 0)
@@ -0,0 +1,53 @@
+package org.apache.lucene.analysis.lv;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+
+public class TestLatvianAnalyzer extends BaseTokenStreamTestCase {
+ /** Constructing the analyzer loads the default stopwords; per the original
+ * author this fails with an NPE when the stopwords file is missing in classpath */
+ public void testResourcesAvailable() {
+ new LatvianAnalyzer(TEST_VERSION_CURRENT);
+ }
+
+ /** Default configuration: inflected forms are stemmed and stopwords are dropped. */
+ public void testBasics() throws IOException {
+ Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT);
+ // stemming
+ checkOneTermReuse(a, "tirgiem", "tirg");
+ checkOneTermReuse(a, "tirgus", "tirg");
+ // stopword: "un" must produce no tokens at all
+ assertAnalyzesTo(a, "un", new String[] {});
+ }
+
+ /** Terms listed in the stem exclusion set must pass through unstemmed. */
+ public void testExclude() throws IOException {
+ Set<String> exclusionSet = new HashSet<String>();
+ exclusionSet.add("tirgiem");
+ Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT,
+ LatvianAnalyzer.getDefaultStopSet(), exclusionSet);
+ checkOneTermReuse(a, "tirgiem", "tirgiem");
+ checkOneTermReuse(a, "tirgus", "tirg");
+ }
+}
Property changes on: modules\analysis\common\src\test\org\apache\lucene\analysis\lv\TestLatvianAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemmer.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemmer.java (revision 0)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemmer.java (revision 0)
@@ -0,0 +1,272 @@
+package org.apache.lucene.analysis.lv;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+/**
+ * Basic tests for {@link LatvianStemmer}, run through a whitespace tokenizer and {@link LatvianStemFilter}.
+ */
+public class TestLatvianStemmer extends BaseTokenStreamTestCase {
+ private Analyzer a = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+ return new TokenStreamComponents(tokenizer, new LatvianStemFilter(tokenizer));
+ }
+ };
+
+ public void testNouns1() throws IOException {
+ // decl. I: every case/number form is expected to reduce to the same stem
+ checkOneTerm(a, "tēvs", "tēv"); // nom. sing.
+ checkOneTerm(a, "tēvi", "tēv"); // nom. pl.
+ checkOneTerm(a, "tēva", "tēv"); // gen. sing.
+ checkOneTerm(a, "tēvu", "tēv"); // gen. pl.
+ checkOneTerm(a, "tēvam", "tēv"); // dat. sing.
+ checkOneTerm(a, "tēviem", "tēv"); // dat. pl.
+ checkOneTerm(a, "tēvu", "tēv"); // acc. sing.
+ checkOneTerm(a, "tēvus", "tēv"); // acc. pl.
+ checkOneTerm(a, "tēvā", "tēv"); // loc. sing.
+ checkOneTerm(a, "tēvos", "tēv"); // loc. pl.
+ checkOneTerm(a, "tēvs", "tēv"); // voc. sing.
+ checkOneTerm(a, "tēvi", "tēv"); // voc. pl.
+ }
+
+ /**
+ * decl II nouns with (s,t) -> š and (d,z) -> ž
+ * palatalization will generally conflate to two stems
+ * due to the ambiguity (plural and singular).
+ */
+ public void testNouns2() throws IOException {
+ // decl. II
+
+ // c -> č palatalization
+ checkOneTerm(a, "lācis", "lāc"); // nom. sing.
+ checkOneTerm(a, "lāči", "lāc"); // nom. pl.
+ checkOneTerm(a, "lāča", "lāc"); // gen. sing.
+ checkOneTerm(a, "lāču", "lāc"); // gen. pl.
+ checkOneTerm(a, "lācim", "lāc"); // dat. sing.
+ checkOneTerm(a, "lāčiem", "lāc"); // dat. pl.
+ checkOneTerm(a, "lāci", "lāc"); // acc. sing.
+ checkOneTerm(a, "lāčus", "lāc"); // acc. pl.
+ checkOneTerm(a, "lācī", "lāc"); // loc. sing.
+ checkOneTerm(a, "lāčos", "lāc"); // loc. pl.
+ checkOneTerm(a, "lāci", "lāc"); // voc. sing.
+ checkOneTerm(a, "lāči", "lāc"); // voc. pl.
+
+ // n -> ņ palatalization
+ checkOneTerm(a, "akmens", "akmen"); // nom. sing.
+ checkOneTerm(a, "akmeņi", "akmen"); // nom. pl.
+ checkOneTerm(a, "akmens", "akmen"); // gen. sing.
+ checkOneTerm(a, "akmeņu", "akmen"); // gen. pl.
+ checkOneTerm(a, "akmenim", "akmen"); // dat. sing.
+ checkOneTerm(a, "akmeņiem", "akmen"); // dat. pl.
+ checkOneTerm(a, "akmeni", "akmen"); // acc. sing.
+ checkOneTerm(a, "akmeņus", "akmen"); // acc. pl.
+ checkOneTerm(a, "akmenī", "akmen"); // loc. sing.
+ checkOneTerm(a, "akmeņos", "akmen"); // loc. pl.
+ checkOneTerm(a, "akmens", "akmen"); // voc. sing.
+ checkOneTerm(a, "akmeņi", "akmen"); // voc. pl.
+
+ // no palatalization
+ checkOneTerm(a, "kurmis", "kurm"); // nom. sing.
+ checkOneTerm(a, "kurmji", "kurm"); // nom. pl.
+ checkOneTerm(a, "kurmja", "kurm"); // gen. sing.
+ checkOneTerm(a, "kurmju", "kurm"); // gen. pl.
+ checkOneTerm(a, "kurmim", "kurm"); // dat. sing.
+ checkOneTerm(a, "kurmjiem", "kurm"); // dat. pl.
+ checkOneTerm(a, "kurmi", "kurm"); // acc. sing.
+ checkOneTerm(a, "kurmjus", "kurm"); // acc. pl.
+ checkOneTerm(a, "kurmī", "kurm"); // loc. sing.
+ checkOneTerm(a, "kurmjos", "kurm"); // loc. pl.
+ checkOneTerm(a, "kurmi", "kurm"); // voc. sing.
+ checkOneTerm(a, "kurmji", "kurm"); // voc. pl.
+ }
+
+ public void testNouns3() throws IOException {
+ // decl III
+ checkOneTerm(a, "lietus", "liet"); // nom. sing.
+ checkOneTerm(a, "lieti", "liet"); // nom. pl.
+ checkOneTerm(a, "lietus", "liet"); // gen. sing.
+ checkOneTerm(a, "lietu", "liet"); // gen. pl.
+ checkOneTerm(a, "lietum", "liet"); // dat. sing.
+ checkOneTerm(a, "lietiem", "liet"); // dat. pl.
+ checkOneTerm(a, "lietu", "liet"); // acc. sing.
+ checkOneTerm(a, "lietus", "liet"); // acc. pl.
+ checkOneTerm(a, "lietū", "liet"); // loc. sing.
+ checkOneTerm(a, "lietos", "liet"); // loc. pl.
+ checkOneTerm(a, "lietus", "liet"); // voc. sing.
+ checkOneTerm(a, "lieti", "liet"); // voc. pl.
+ }
+
+ public void testNouns4() throws IOException {
+ // decl IV
+ checkOneTerm(a, "lapa", "lap"); // nom. sing.
+ checkOneTerm(a, "lapas", "lap"); // nom. pl.
+ checkOneTerm(a, "lapas", "lap"); // gen. sing.
+ checkOneTerm(a, "lapu", "lap"); // gen. pl.
+ checkOneTerm(a, "lapai", "lap"); // dat. sing.
+ checkOneTerm(a, "lapām", "lap"); // dat. pl.
+ checkOneTerm(a, "lapu", "lap"); // acc. sing.
+ checkOneTerm(a, "lapas", "lap"); // acc. pl.
+ checkOneTerm(a, "lapā", "lap"); // loc. sing.
+ checkOneTerm(a, "lapās", "lap"); // loc. pl.
+ checkOneTerm(a, "lapa", "lap"); // voc. sing.
+ checkOneTerm(a, "lapas", "lap"); // voc. pl.
+
+ checkOneTerm(a, "puika", "puik"); // nom. sing.
+ checkOneTerm(a, "puikas", "puik"); // nom. pl.
+ checkOneTerm(a, "puikas", "puik"); // gen. sing.
+ checkOneTerm(a, "puiku", "puik"); // gen. pl.
+ checkOneTerm(a, "puikam", "puik"); // dat. sing.
+ checkOneTerm(a, "puikām", "puik"); // dat. pl.
+ checkOneTerm(a, "puiku", "puik"); // acc. sing.
+ checkOneTerm(a, "puikas", "puik"); // acc. pl.
+ checkOneTerm(a, "puikā", "puik"); // loc. sing.
+ checkOneTerm(a, "puikās", "puik"); // loc. pl.
+ checkOneTerm(a, "puika", "puik"); // voc. sing.
+ checkOneTerm(a, "puikas", "puik"); // voc. pl.
+ }
+
+ /**
+ * Genitive plural forms with (s,t) -> š and (d,z) -> ž
+ * will not conflate due to ambiguity.
+ */
+ public void testNouns5() throws IOException {
+ // decl V
+ // l -> ļ palatalization
+ checkOneTerm(a, "egle", "egl"); // nom. sing.
+ checkOneTerm(a, "egles", "egl"); // nom. pl.
+ checkOneTerm(a, "egles", "egl"); // gen. sing.
+ checkOneTerm(a, "egļu", "egl"); // gen. pl.
+ checkOneTerm(a, "eglei", "egl"); // dat. sing.
+ checkOneTerm(a, "eglēm", "egl"); // dat. pl.
+ checkOneTerm(a, "egli", "egl"); // acc. sing.
+ checkOneTerm(a, "egles", "egl"); // acc. pl.
+ checkOneTerm(a, "eglē", "egl"); // loc. sing.
+ checkOneTerm(a, "eglēs", "egl"); // loc. pl.
+ checkOneTerm(a, "egle", "egl"); // voc. sing.
+ checkOneTerm(a, "egles", "egl"); // voc. pl.
+ }
+
+ public void testNouns6() throws IOException {
+ // decl VI
+
+ // no palatalization
+ checkOneTerm(a, "govs", "gov"); // nom. sing.
+ checkOneTerm(a, "govis", "gov"); // nom. pl.
+ checkOneTerm(a, "govs", "gov"); // gen. sing.
+ checkOneTerm(a, "govju", "gov"); // gen. pl.
+ checkOneTerm(a, "govij", "gov"); // dat. sing.
+ checkOneTerm(a, "govīm", "gov"); // dat. pl.
+ checkOneTerm(a, "govi", "gov"); // acc. sing.
+ checkOneTerm(a, "govis", "gov"); // acc. pl.
+ checkOneTerm(a, "govi", "gov"); // inst. sing.
+ checkOneTerm(a, "govīm", "gov"); // inst. pl.
+ checkOneTerm(a, "govī", "gov"); // loc. sing.
+ checkOneTerm(a, "govīs", "gov"); // loc. pl.
+ checkOneTerm(a, "govs", "gov"); // voc. sing.
+ checkOneTerm(a, "govis", "gov"); // voc. pl.
+ }
+
+ public void testAdjectives() throws IOException {
+ checkOneTerm(a, "zils", "zil"); // indef. nom. masc. sing.
+ checkOneTerm(a, "zilais", "zil"); // def. nom. masc. sing.
+ checkOneTerm(a, "zili", "zil"); // indef. nom. masc. pl.
+ checkOneTerm(a, "zilie", "zil"); // def. nom. masc. pl.
+ checkOneTerm(a, "zila", "zil"); // indef. nom. fem. sing.
+ checkOneTerm(a, "zilā", "zil"); // def. nom. fem. sing.
+ checkOneTerm(a, "zilas", "zil"); // indef. nom. fem. pl.
+ checkOneTerm(a, "zilās", "zil"); // def. nom. fem. pl.
+ checkOneTerm(a, "zila", "zil"); // indef. gen. masc. sing.
+ checkOneTerm(a, "zilā", "zil"); // def. gen. masc. sing.
+ checkOneTerm(a, "zilu", "zil"); // indef. gen. masc. pl.
+ checkOneTerm(a, "zilo", "zil"); // def. gen. masc. pl.
+ checkOneTerm(a, "zilas", "zil"); // indef. gen. fem. sing.
+ checkOneTerm(a, "zilās", "zil"); // def. gen. fem. sing.
+ checkOneTerm(a, "zilu", "zil"); // indef. gen. fem. pl.
+ checkOneTerm(a, "zilo", "zil"); // def. gen. fem. pl.
+ checkOneTerm(a, "zilam", "zil"); // indef. dat. masc. sing.
+ checkOneTerm(a, "zilajam", "zil"); // def. dat. masc. sing.
+ checkOneTerm(a, "ziliem", "zil"); // indef. dat. masc. pl.
+ checkOneTerm(a, "zilajiem", "zil"); // def. dat. masc. pl.
+ checkOneTerm(a, "zilai", "zil"); // indef. dat. fem. sing.
+ checkOneTerm(a, "zilajai", "zil"); // def. dat. fem. sing.
+ checkOneTerm(a, "zilām", "zil"); // indef. dat. fem. pl.
+ checkOneTerm(a, "zilajām", "zil"); // def. dat. fem. pl.
+ checkOneTerm(a, "zilu", "zil"); // indef. acc. masc. sing.
+ checkOneTerm(a, "zilo", "zil"); // def. acc. masc. sing.
+ checkOneTerm(a, "zilus", "zil"); // indef. acc. masc. pl.
+ checkOneTerm(a, "zilos", "zil"); // def. acc. masc. pl.
+ checkOneTerm(a, "zilu", "zil"); // indef. acc. fem. sing.
+ checkOneTerm(a, "zilo", "zil"); // def. acc. fem. sing.
+ checkOneTerm(a, "zilas", "zil"); // indef. acc. fem. pl.
+ checkOneTerm(a, "zilās", "zil"); // def. acc. fem. pl.
+ checkOneTerm(a, "zilā", "zil"); // indef. loc. masc. sing.
+ checkOneTerm(a, "zilajā", "zil"); // def. loc. masc. sing.
+ checkOneTerm(a, "zilos", "zil"); // indef. loc. masc. pl.
+ checkOneTerm(a, "zilajos", "zil"); // def. loc. masc. pl.
+ checkOneTerm(a, "zilā", "zil"); // indef. loc. fem. sing.
+ checkOneTerm(a, "zilajā", "zil"); // def. loc. fem. sing.
+ checkOneTerm(a, "zilās", "zil"); // indef. loc. fem. pl.
+ checkOneTerm(a, "zilajās", "zil"); // def. loc. fem. pl.
+ checkOneTerm(a, "zilais", "zil"); // voc. masc. sing.
+ checkOneTerm(a, "zilie", "zil"); // voc. masc. pl.
+ checkOneTerm(a, "zilā", "zil"); // voc. fem. sing.
+ checkOneTerm(a, "zilās", "zil"); // voc. fem. pl.
+ }
+
+ /**
+ * Note: we intentionally don't handle the ambiguous
+ * (s,t) -> š and (d,z) -> ž
+ */
+ public void testPalatalization() throws IOException {
+ checkOneTerm(a, "krāsns", "krāsn"); // nom. sing.
+ checkOneTerm(a, "krāšņu", "krāsn"); // gen. pl.
+ checkOneTerm(a, "zvaigzne", "zvaigzn"); // nom. sing.
+ checkOneTerm(a, "zvaigžņu", "zvaigzn"); // gen. pl.
+ checkOneTerm(a, "kāpslis", "kāpsl"); // nom. sing.
+ checkOneTerm(a, "kāpšļu", "kāpsl"); // gen. pl.
+ checkOneTerm(a, "zizlis", "zizl"); // nom. sing.
+ checkOneTerm(a, "zižļu", "zizl"); // gen. pl.
+ checkOneTerm(a, "vilnis", "viln"); // nom. sing.
+ checkOneTerm(a, "viļņu", "viln"); // gen. pl.
+ checkOneTerm(a, "lelle", "lell"); // nom. sing.
+ checkOneTerm(a, "leļļu", "lell"); // gen. pl.
+ checkOneTerm(a, "pinne", "pinn"); // nom. sing.
+ checkOneTerm(a, "piņņu", "pinn"); // gen. pl.
+ checkOneTerm(a, "rīkste", "rīkst"); // nom. sing.
+ checkOneTerm(a, "rīkšu", "rīkst"); // gen. pl.
+ }
+
+ /**
+ * Test some length restrictions, we require a 3+ char stem,
+ * with at least one vowel.
+ */
+ public void testLength() throws IOException {
+ checkOneTerm(a, "usa", "usa"); // length: stripping -a would leave only 2 chars
+ checkOneTerm(a, "60ms", "60ms"); // vowel count: no vowel at all, so -s stays
+ }
+}
Property changes on: modules\analysis\common\src\test\org\apache\lucene\analysis\lv\TestLatvianStemmer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java (revision 0)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java (revision 0)
@@ -0,0 +1,129 @@
+package org.apache.lucene.analysis.lv;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.Version;
+
+/**
+ * {@link Analyzer} for Latvian.
+ */
+public final class LatvianAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /** File containing default Latvian stopwords. */
+ public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+ /**
+ * Returns the default stop words set, loaded from {@link #DEFAULT_STOPWORD_FILE}.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Loads DEFAULT_STOP_SET lazily, the first time the outer class accesses it.
+ * NOTE(review): confirm this getWordSet overload skips the '#' comment lines.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = WordlistLoader.getWordSet(LatvianAnalyzer.class,
+ DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR); keep the cause so a failure stays diagnosable
+ throw new RuntimeException("Unable to load default stopword set", ex);
+ }
+ }
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public LatvianAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words and an empty stem exclusion set.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public LatvianAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public LatvianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a
+ * {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
+ * which tokenizes all the text in the provided {@link Reader}.
+ *
+ * @return A
+ * {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
+ * built from a {@link StandardTokenizer} filtered with
+ * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter},
+ * {@link KeywordMarkerFilter} if a non-empty stem exclusion set was
+ * provided, and {@link LatvianStemFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(matchVersion, source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if (!stemExclusionSet.isEmpty()) {
+ result = new KeywordMarkerFilter(result, stemExclusionSet);
+ }
+ result = new LatvianStemFilter(result);
+ return new TokenStreamComponents(source, result);
+ }
+}
Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\lv\LatvianAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemmer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemmer.java (revision 0)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemmer.java (revision 0)
@@ -0,0 +1,175 @@
+package org.apache.lucene.analysis.lv;
+
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Light stemmer for Latvian.
+ * <p>
+ * This is a light version of the algorithm in Karlis Kreslins' PhD thesis
+ * <i>A stemming algorithm for Latvian</i> with the following modifications:
+ * <ul>
+ * <li>Only explicitly stems noun and adjective morphology
+ * <li>Stricter length/vowel checks for the resulting stems
+ * (verb etc suffix stripping is removed)
+ * <li>Removes only the primary inflectional suffixes: case and number for nouns ;
+ * case, number, gender, and definitiveness for adjectives.
+ * <li>Palatalization is only handled when a declension II,V,VI noun suffix is removed.
+ * </ul>
+ */
+public class LatvianStemmer {
+ /**
+ * Stems a Latvian word in place and returns the new adjusted length.
+ * The first entry of {@code affixes} that ends the word is stripped, provided the
+ * word has more vowels than the affix and is at least three characters longer.
+ * @param s buffer holding the word; the palatalization rules may rewrite its tail
+ * @param len number of characters of {@code s} that make up the word
+ * @return length of the stem, or {@code len} unchanged when no affix applies
+ */
+ public int stem(char s[], int len) {
+ int numVowels = numVowels(s, len);
+
+ for (int i = 0; i < affixes.length; i++) {
+ Affix affix = affixes[i];
+ if (numVowels > affix.vc && len >= affix.affix.length + 3 && endsWith(s, len, affix.affix)) {
+ len -= affix.affix.length;
+ return affix.palatalizes ? unpalatalize(s, len) : len;
+ }
+ }
+
+ return len;
+ }
+
+ static final Affix affixes[] = { // tried in order; the first match wins
+ new Affix("ajiem", 3, false), new Affix("ajai", 3, false),
+ new Affix("ajam", 2, false), new Affix("ajām", 2, false),
+ new Affix("ajos", 2, false), new Affix("ajās", 2, false),
+ new Affix("iem", 2, true), new Affix("ajā", 2, false),
+ new Affix("ais", 2, false), new Affix("ai", 2, false),
+ new Affix("ei", 2, false), new Affix("ām", 1, false),
+ new Affix("am", 1, false), new Affix("ēm", 1, false),
+ new Affix("īm", 1, false), new Affix("im", 1, false),
+ new Affix("um", 1, false), new Affix("us", 1, true),
+ new Affix("as", 1, false), new Affix("ās", 1, false),
+ new Affix("es", 1, false), new Affix("os", 1, true),
+ new Affix("ij", 1, false), new Affix("īs", 1, false),
+ new Affix("ēs", 1, false), new Affix("is", 1, false),
+ new Affix("ie", 1, false), new Affix("u", 1, true),
+ new Affix("a", 1, true), new Affix("i", 1, true),
+ new Affix("e", 1, false), new Affix("ā", 1, false),
+ new Affix("ē", 1, false), new Affix("ī", 1, false),
+ new Affix("ū", 1, false), new Affix("o", 1, false),
+ new Affix("s", 0, false), new Affix("š", 0, false),
+ };
+
+ static class Affix {
+ char affix[]; // suffix
+ int vc; // vowel count of the suffix
+ boolean palatalizes; // true if we should fire palatalization rules.
+
+ Affix(String affix, int vc, boolean palatalizes) {
+ this.affix = affix.toCharArray();
+ this.vc = vc;
+ this.palatalizes = palatalizes;
+ }
+ }
+
+ /**
+ * Reverses palatalization at the end of the stem of length {@code len}. Most cases
+ * are handled except for the ambiguous ones: s -> š, t -> š, d -> ž, z -> ž.
+ */
+ private int unpalatalize(char s[], int len) {
+ // s[len] is the first character of the suffix that was just removed: if its -u
+ // then its 2,5, or 6 gen pl., and these two can only apply then.
+ if (s[len] == 'u') {
+ // kš -> kst
+ if (endsWith(s, len, "kš")) {
+ len++; // the stem grows by one char, reusing the slot of the removed suffix
+ s[len-2] = 's';
+ s[len-1] = 't';
+ return len;
+ }
+ // ņņ -> nn
+ if (endsWith(s, len, "ņņ")) {
+ s[len-2] = 'n';
+ s[len-1] = 'n';
+ return len;
+ }
+ }
+
+ // otherwise all other rules
+ if (endsWith(s, len, "pj") || endsWith(s, len, "bj")
+ || endsWith(s, len, "mj") || endsWith(s, len, "vj")) {
+ // labial consonant
+ return len-1;
+ } else if (endsWith(s, len, "šņ")) {
+ s[len-2] = 's';
+ s[len-1] = 'n';
+ return len;
+ } else if (endsWith(s, len, "žņ")) {
+ s[len-2] = 'z';
+ s[len-1] = 'n';
+ return len;
+ } else if (endsWith(s, len, "šļ")) {
+ s[len-2] = 's';
+ s[len-1] = 'l';
+ return len;
+ } else if (endsWith(s, len, "žļ")) {
+ s[len-2] = 'z';
+ s[len-1] = 'l';
+ return len;
+ } else if (endsWith(s, len, "ļņ")) {
+ s[len-2] = 'l';
+ s[len-1] = 'n';
+ return len;
+ } else if (endsWith(s, len, "ļļ")) {
+ s[len-2] = 'l';
+ s[len-1] = 'l';
+ return len;
+ } else if (s[len-1] == 'č') {
+ s[len-1] = 'c';
+ return len;
+ } else if (s[len-1] == 'ļ') {
+ s[len-1] = 'l';
+ return len;
+ } else if (s[len-1] == 'ņ') {
+ s[len-1] = 'n';
+ return len;
+ }
+
+ return len;
+ }
+
+ /**
+ * Counts the vowels among the first {@code len} chars of {@code s}. stem() compares
+ * this with an affix's own vowel count, so an accepted stem keeps at least one vowel.
+ */
+ private int numVowels(char s[], int len) {
+ int n = 0;
+ for (int i = 0; i < len; i++) {
+ switch(s[i]) {
+ case 'a': case 'e': case 'i':
+ case 'o': case 'u': case 'ā':
+ case 'ī': case 'ē': case 'ū':
+ n++;
+ }
+ }
+ return n;
+ }
+}
Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\lv\LatvianStemmer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilter.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilter.java (revision 0)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilter.java (revision 0)
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.lv;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * {@link TokenFilter} that runs every term through {@link LatvianStemmer},
+ * shortening it to its stem.
+ * <p>
+ * Terms whose {@link KeywordAttribute} is set are left untouched, so place a
+ * {@link KeywordMarkerFilter} (or a custom {@link TokenFilter} setting that
+ * attribute) before this {@link TokenStream} to protect terms from stemming.
+ * </p>
+ */
+public final class LatvianStemFilter extends TokenFilter {
+ private final LatvianStemmer stemmer = new LatvianStemmer();
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+ public LatvianStemFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (!input.incrementToken()) {
+ return false; // wrapped stream is exhausted
+ }
+ // terms flagged as keywords are passed through unchanged
+ if (!keywordAttr.isKeyword()) {
+ // the stemmer edits the term buffer in place and reports the new length
+ termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
+ }
+ return true;
+ }
+}
Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\lv\LatvianStemFilter.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/lv/package.html
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/lv/package.html (revision 0)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/lv/package.html (revision 0)
@@ -0,0 +1,22 @@
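+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<head></head>
+<body>
+Analyzer for Latvian.
+</body>
+</html>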
Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\lv\package.html
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/common/src/resources/org/apache/lucene/analysis/lv/stopwords.txt
===================================================================
--- modules/analysis/common/src/resources/org/apache/lucene/analysis/lv/stopwords.txt (revision 0)
+++ modules/analysis/common/src/resources/org/apache/lucene/analysis/lv/stopwords.txt (revision 0)
@@ -0,0 +1,172 @@
+# Latvian stopwords taken from "A Stemming Algorithm for Latvian" by Karlis Kreslins.
+# The original list of over 800 forms was refined:
+# pronouns, adverbs and interjections were removed.
+#
+# prepositions
+aiz
+ap
+ar
+apakš
+ārpus
+augšpus
+bez
+caur
+dēļ
+gar
+iekš
+iz
+kopš
+labad
+lejpus
+līdz
+no
+otrpus
+pa
+par
+pār
+pēc
+pie
+pirms
+pret
+priekš
+starp
+šaipus
+uz
+viņpus
+virs
+virspus
+zem
+apakšpus
+# Conjunctions
+un
+bet
+jo
+ja
+ka
+lai
+tomēr
+tikko
+turpretī
+arī
+kaut
+gan
+tādēļ
+tā
+ne
+tikvien
+vien
+kā
+ir
+te
+vai
+kamēr
+# Particles
+ar
+diezin
+droši
+diemžēl
+nebūt
+ik
+it
+taču
+nu
+pat
+tiklab
+iekšpus
+nedz
+tik
+nevis
+turpretim
+jeb
+iekam
+iekām
+iekāms
+kolīdz
+līdzko
+tiklīdz
+jebšu
+tālab
+tāpēc
+nekā
+itin
+jā
+jau
+jel
+nē
+nezin
+tad
+tikai
+vis
+tak
+iekams
+vien
+# modal verbs
+būt
+biju
+biji
+bija
+bijām
+bijāt
+esmu
+esi
+esam
+esat
+būšu
+būsi
+būs
+būsim
+būsiet
+tikt
+tiku
+tiki
+tika
+tikām
+tikāt
+tieku
+tiec
+tiek
+tiekam
+tiekat
+tikšu
+tiks
+tiksim
+tiksiet
+tapt
+tapi
+tapāt
+topat
+tapšu
+tapsi
+taps
+tapsim
+tapsiet
+kļūt
+kļuvu
+kļuvi
+kļuva
+kļuvām
+kļuvāt
+kļūstu
+kļūsti
+kļūst
+kļūstam
+kļūstat
+kļūšu
+kļūsi
+kļūs
+kļūsim
+kļūsiet
+# verbs
+varēt
+varēju
+varējām
+varēšu
+varēsim
+var
+varēji
+varējāt
+varēsi
+varēsiet
+varat
+varēja
+varēs
Property changes on: modules\analysis\common\src\resources\org\apache\lucene\analysis\lv\stopwords.txt
___________________________________________________________________
Added: svn:eol-style
+ native