Index: solr/core/src/test/org/apache/solr/analysis/TestLatinNumberConvertFilterFactory.java
===================================================================
--- solr/core/src/test/org/apache/solr/analysis/TestLatinNumberConvertFilterFactory.java (revision 0)
+++ solr/core/src/test/org/apache/solr/analysis/TestLatinNumberConvertFilterFactory.java (revision 0)
@@ -0,0 +1,49 @@
+package org.apache.solr.analysis;
+
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+
+
+/**
+ * Simple tests to ensure the Latin number convert filter factory is working.
+ */
+public class TestLatinNumberConvertFilterFactory extends BaseTokenStreamTestCase {
+  public void testNumberFormatStrictDefault() throws Exception {
+    Reader reader = new StringReader("IC VC IL S I i V IV VI VII X XA XIX MXM");
+    LatinNumberConvertFilterFactory factory = new LatinNumberConvertFilterFactory();
+    TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false));
+    assertTokenStreamContents(stream, new String[] {"99", "VC", "49", "S", "1", "1", "5", "4", "6", "7", "10", "XA", "19", "1990" });
+  }
+
+  public void testNumberFormatStrictFalse() throws Exception {
+    // init Filter
+    Map args = new HashMap();
+    args.put("strictMode", "false");
+    Reader reader = new StringReader("IC VC IL S I i V IV VI VII X XA XIX MXM");
+    LatinNumberConvertFilterFactory factory = new LatinNumberConvertFilterFactory();
+    factory.init(args);
+
+    // test
+    TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false));
+    assertTokenStreamContents(stream, new String[] {"99", "VC", "49", "S", "1", "1", "5", "4", "6", "7", "10", "XA", "19", "1990" });
+  }
+
+  public void testNumberFormatStrictTrue() throws Exception {
+    // init Filter
+    Map args = new HashMap();
+    args.put("strictMode", "true");
+    Reader reader = new StringReader("IC VC IL S I i V IV VI VII X XA XIX MXM");
+    LatinNumberConvertFilterFactory factory = new LatinNumberConvertFilterFactory();
+    factory.init(args);
+
+    // test
+    TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false));
+    assertTokenStreamContents(stream, new String[] {"IC", "VC", "IL", "S", "1", "1", "5", "4", "6", "7", "10", "XA", "19", "MXM" });
+  }
+}
Index: solr/core/src/test/org/apache/solr/analysis/TestLatinStemFilterFactory.java
===================================================================
--- solr/core/src/test/org/apache/solr/analysis/TestLatinStemFilterFactory.java (revision 0)
+++ solr/core/src/test/org/apache/solr/analysis/TestLatinStemFilterFactory.java (revision 0)
@@ -0,0 +1,21 @@
+package org.apache.solr.analysis;
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Simple tests to ensure the Latin stem factory is working.
+ */
+public class TestLatinStemFilterFactory extends BaseTokenStreamTestCase {
+
+  public void testLatinStemFilterFactory() throws Exception {
+    Reader reader = new StringReader("adultero filius filivs FILIVS filii atque");
+    LatinStemFilterFactory factory = new LatinStemFilterFactory();
+    TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false));
+    assertTokenStreamContents(stream, new String[] {"adulter", "adulteri", "fil", "filiu", "fil", "filiu", "FIL", "FILIU", "fili", "filii", "atque", "atque"});
+  }
+}
Index: solr/core/src/java/org/apache/solr/analysis/LatinStemFilterFactory.java
===================================================================
--- solr/core/src/java/org/apache/solr/analysis/LatinStemFilterFactory.java (revision 0)
+++ solr/core/src/java/org/apache/solr/analysis/LatinStemFilterFactory.java (revision 0)
@@ -0,0 +1,23 @@
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.la.LatinStemFilter;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link LatinStemFilter}.
+ *
+ * <fieldType name="text_latin" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LatinStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ *
+ */
+public class LatinStemFilterFactory extends TokenFilterFactory {
+  public TokenStream create(TokenStream input) {
+    return new LatinStemFilter(input);
+  }
+}
Index: solr/core/src/java/org/apache/solr/analysis/LatinNumberConvertFilterFactory.java
===================================================================
--- solr/core/src/java/org/apache/solr/analysis/LatinNumberConvertFilterFactory.java (revision 0)
+++ solr/core/src/java/org/apache/solr/analysis/LatinNumberConvertFilterFactory.java (revision 0)
@@ -0,0 +1,36 @@
+package org.apache.solr.analysis;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.la.LatinNumberConvertFilter;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link LatinNumberConvertFilter}.
+ *
+ * <fieldType name="text_latin" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LatinNumberConvertFilterFactory" strictMode="true"/>
+ *   </analyzer>
+ * </fieldType>
+ *
+ */
+public class LatinNumberConvertFilterFactory extends TokenFilterFactory {
+
+  /** flag that indicates the computation mode */
+  private boolean strictMode = false;
+
+  @Override
+  public void init(Map args) {
+    super.init(args);
+    this.strictMode = getBoolean("strictMode", false);
+  }
+
+  public TokenStream create(TokenStream input) {
+    return new LatinNumberConvertFilter(input, this.strictMode);
+  }
+}
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/la/TestLatinNumberConvertFilter.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/la/TestLatinNumberConvertFilter.java (revision 0)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/la/TestLatinNumberConvertFilter.java (revision 0)
@@ -0,0 +1,44 @@
+package org.apache.lucene.analysis.la;
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.la.LatinNumberConvertFilter;
+
+
+import static org.apache.lucene.analysis.VocabularyAssert.*;
+
+/**
+ * Simple tests for {@link LatinNumberConvertFilter}
+ */
+public class TestLatinNumberConvertFilter extends BaseTokenStreamTestCase {
+  private Analyzer analyzerStrictTrue = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+      return new TokenStreamComponents(source, new LatinNumberConvertFilter(source, true));
+    }
+  };
+
+  private Analyzer analyzerStrictFalse = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+      return new TokenStreamComponents(source, new LatinNumberConvertFilter(source, false));
+    }
+  };
+
+  /** Test against a vocabulary from the reference impl - strictMode="true" */
+  public void testVocabularyStrictTrue() throws IOException {
+    assertVocabulary(analyzerStrictTrue, getDataFile("latinNumberTestData.zip"), "latinNumberTestDataStrictTrue.txt");
+  }
+
+  /** Test against a vocabulary from the reference impl - strictMode="false" */
+  public void testVocabularyStrictFalse() throws IOException {
+    assertVocabulary(analyzerStrictFalse, getDataFile("latinNumberTestData.zip"), "latinNumberTestDataStrictFalse.txt");
+  }
+}
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/la/TestLatinStemFilter.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/la/TestLatinStemFilter.java (revision 0)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/la/TestLatinStemFilter.java (revision 0)
@@ -0,0 +1,64 @@
+package org.apache.lucene.analysis.la;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.zip.ZipFile;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.la.LatinStemFilter;
+
+/**
+ * Simple tests for {@link LatinStemFilter}
+ */
+public class TestLatinStemFilter extends BaseTokenStreamTestCase {
+  private Analyzer analyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+      return new TokenStreamComponents(source, new LatinStemFilter(source));
+    }
+  };
+
+  /** Test against a sample vocabulary from the reference impl */
+  public void testSampleVocabulary() throws IOException {
+    assertLatinVocabulary(analyzer, getDataFile("latinTestData.zip"), "latinTestData.txt");
+  }
+
+  /** Test against a complete vocabulary from the reference impl */
+  public void testCompleteVocabulary() throws IOException {
+    assertLatinVocabulary(analyzer, getDataFile("latinTestData.zip"), "latinTestData_complete.txt");
+  }
+
+  // helper methods (adapted from VocabularyAssert, BaseTokenStreamTestCase)
+  private void assertLatinVocabulary(Analyzer a, File zipFile, String vocOut) throws IOException {
+    ZipFile zip = new ZipFile(zipFile);
+    InputStream vo = zip.getInputStream(zip.getEntry(vocOut));
+    this.assertLatinVocabulary(a, vo);
+    vo.close();
+    zip.close();
+  }
+
+  private void assertLatinVocabulary(Analyzer a, InputStream vocOut) throws IOException {
+    BufferedReader vocReader = new BufferedReader(new InputStreamReader(vocOut, "UTF-8"));
+    String inputLine = null;
+    while ((inputLine = vocReader.readLine()) != null) {
+      if (inputLine.startsWith("#") || inputLine.trim().length() == 0) {
+        continue; // comment or empty line
+      }
+
+      String words[] = inputLine.split("\t");
+      if (words.length != 3) {
+        continue; // invalid input
+      }
+
+      BaseTokenStreamTestCase.assertAnalyzesToReuse(a, words[0], new String[]{words[1], words[2]});
+    }
+  }
+}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/la/LatinNumberConverter.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/la/LatinNumberConverter.java (revision 0)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/la/LatinNumberConverter.java (revision 0)
@@ -0,0 +1,217 @@
+package org.apache.lucene.analysis.la;
+
+import java.util.HashMap;
+
+/**
+ * Latin Number Converter.
+ * @author Markus Klose, Waldemar Erhardt
+ */
+public class LatinNumberConverter {
+
+  private boolean strict = false;
+
+  // create a map with the latin letters and their corresponding values
+  private static final HashMap<Character, Integer> latinArabicMap = new HashMap<Character, Integer>();
+
+  static {
+    latinArabicMap.put('m', 1000);
+    latinArabicMap.put('d', 500);
+    latinArabicMap.put('c', 100);
+    latinArabicMap.put('l', 50);
+    latinArabicMap.put('x', 10);
+    latinArabicMap.put('v', 5);
+    latinArabicMap.put('i', 1);
+  }
+
+  /** constructor setting the computation mode */
+  public LatinNumberConverter(boolean strict) {
+    this.strict = strict;
+  }
+
+  /**
+   * entry point for converting a latin number to an arabic one.
+   *
+   * @author Markus Klose
+   *
+   * @param termBuffer
+   *      term buffer containing token
+   * @param termLength
+   *      length of the token
+   * @return
+   *      arabic value of latin number; null if token is not a number
+   */
+  public String format(char termBuffer[], int termLength) {
+    return this.latinToArabic(termBuffer, termLength);
+  }
+
+  /**
+   * doing the conversion.
+   *
+   * @author Markus Klose, Waldemar Erhardt
+   *
+   * @param termBuffer
+   *      term buffer containing token
+   * @param termLength
+   *      length of the token
+   * @return
+   *      arabic value of latin number; null if token is not a number
+   *
+   */
+  private String latinToArabic(char termBuffer[], int termLength) {
+
+    // check if term is a valid latin number
+    if (this.validate(termBuffer, termLength)) {
+
+      String latin = String.valueOf(termBuffer, 0, termLength);
+      String arabic = this.convertLatinToArabic(latin, strict);
+      if(latin.equals(arabic)) {
+        return null;
+      }
+      return arabic;
+
+    } else {
+      return null;
+    }
+  }
+
+  /**
+   * Converts the latin letter into an arabic value.
+   * @param latin The latin letter.
+   * @return The arabic value or 0.
+   * @author Waldemar Erhardt
+   */
+  private int convertLatinToArabic(char latin) {
+
+    Integer arabic = latinArabicMap.get(Character.toLowerCase(latin));
+    if(arabic != null) {
+      return arabic.intValue();
+    }
+    return 0;
+  }
+
+  /**
+   * Check if the given latin letter is a main char. Main chars are 'M', 'C', 'X' and 'I'.
+   * @param latin The latin letter to check.
+   * @return true if the letter is a main char, else false
+   * @author Waldemar Erhardt
+   */
+  private boolean isMainChar(char latin) {
+    latin = Character.toLowerCase(latin);
+    return latin == 'm' || latin == 'c' || latin == 'x' || latin == 'i';
+  }
+
+  private static final int HELP_CHAR = 0;
+  private static final int MAIN_CHAR = 1;
+  private static final int SUBTRACTION = 2;
+
+  /**
+   * Converts the given latin letters into an arabic value.
+   * (Based on http://www.diaware.de/html/roemzahl.html)
+   * @param latin The latin value to convert.
+   * @param strict Whether stricter validation should be used or not.
+   * @return The arabic value of the latin value, or if some validation
+   *         was violated the original latin value.
+   * @author Waldemar Erhardt
+   */
+  private String convertLatinToArabic(String latin, boolean strict) {
+
+    int maxValue = convertLatinToArabic('M') + 1;
+    int oldValue = convertLatinToArabic('M');
+    int currentArabicValue = 0;
+    int charCounter = 0;
+    int charType = HELP_CHAR; // 1 = main char; 0 = help char; 2 = subtraction
+    int arabicValue = 0;
+
+    for(int i = 0; i < latin.length(); i++) {
+      // check if it is a subtraction
+      if ((i + 1) < latin.length() && convertLatinToArabic(latin.charAt(i)) < convertLatinToArabic(latin.charAt(i + 1))) {
+        // a subtraction always needs a main char before another char
+        if (isMainChar(latin.charAt(i)) == false) {
+          // syntax error: help char is not allowed to be the first char
+          return latin;
+        }
+        // it is a subtraction
+        currentArabicValue = convertLatinToArabic(latin.charAt(i + 1)) - convertLatinToArabic(latin.charAt(i));
+        charType = SUBTRACTION;
+      } else {
+        currentArabicValue = convertLatinToArabic(latin.charAt(i));
+        charType = (isMainChar(latin.charAt(i)) == true) ? MAIN_CHAR : HELP_CHAR;
+      }
+
+      if (oldValue < currentArabicValue) {
+        // syntax error: the chars are not ordered with decreasing valency from left to right
+        return latin;
+      }
+
+      // if strict mode is activated, do further validation
+      if (strict) {
+        if (charType != SUBTRACTION && maxValue < convertLatinToArabic(latin.charAt(i))) {
+          // syntax error: subtraction rule violated
+          return latin;
+        }
+        if (charType == SUBTRACTION && maxValue < convertLatinToArabic(latin.charAt(i + 1))) {
+          // syntax error: subtraction rule violated
+          return latin;
+        }
+        if (charType == SUBTRACTION && (convertLatinToArabic(latin.charAt(i + 1)) / convertLatinToArabic(latin.charAt(i))) > 10) {
+          // syntax error: subtraction rule violated
+          return latin;
+        }
+      }
+
+      if (i > 0 && charType != SUBTRACTION && oldValue == currentArabicValue) {
+        charCounter++;
+        if (charType == MAIN_CHAR && charCounter == 3) {
+          // syntax error: it is not allowed to have more than 3 main chars in succession
+          return latin;
+        }
+        if (charType == HELP_CHAR && charCounter == 1) {
+          // syntax error: it is not allowed to have the same help char in succession
+          return latin;
+        }
+      } else {
+        charCounter = 0;
+      }
+
+      // prepare for next iteration
+      arabicValue += currentArabicValue;
+      oldValue = currentArabicValue;
+      if (charType == SUBTRACTION) {
+        maxValue = convertLatinToArabic(latin.charAt(i));
+      }
+      if (charType == SUBTRACTION) {
+        i++;
+      }
+    }
+    return String.valueOf(arabicValue);
+  }
+
+  /**
+   * validating the token.
+   *
+   * @author Markus Klose
+   *
+   * @param termBuffer
+   *      term buffer containing token
+   * @param termLength
+   *      length of the token
+   * @return
+   *      false if the token contains invalid chars, else true
+   */
+  private boolean validate(char termBuffer[], int termLength) {
+    char toValidate;
+
+    for (int i = 0; i < termLength; i++) {
+      toValidate = Character.toLowerCase(termBuffer[i]);
+      if (toValidate != 'i' && toValidate != 'v' && toValidate != 'x' && toValidate != 'l'
+          && toValidate != 'c' && toValidate != 'd' && toValidate != 'm')
+        return false;
+    }
+    return true;
+  }
+
+}
\ No newline at end of file
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/la/LatinStemmer.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/la/LatinStemmer.java (revision 0)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/la/LatinStemmer.java (revision 0)
@@ -0,0 +1,209 @@
+package org.apache.lucene.analysis.la;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Latin Stemmer.
+ * based on http://snowball.tartarus.org/otherapps/schinke/intro.html
+ * @author Markus Klose
+ */
+public class LatinStemmer {
+  //TODO queList as txt file and property in schema.xml ???
+
+  /** list contains words ending with 'que' that should not be stemmed */
+  private List<String> queList;
+
+  /**
+   * default constructor.
+   *
+   * @author mk
+   */
+  public LatinStemmer() {
+    // initialize the queList
+    queList = Arrays.asList("atque", "quoque", "neque", "itaque", "absque", "apsque", "abusque", "adaeque", "adusque", "denique",
+        "deque", "susque", "oblique", "peraeque", "plenisque", "quandoque", "quisque", "quaeque",
+        "cuiusque", "cuique", "quemque", "quamque", "quaque", "quique", "quorumque", "quarumque",
+        "quibusque", "quosque", "quasque", "quotusquisque", "quousque", "ubique", "undique", "usque",
+        "uterque", "utique", "utroque", "utribique", "torque", "coque", "concoque", "contorque",
+        "detorque", "decoque", "excoque", "extorque", "obtorque", "optorque", "retorque", "recoque",
+        "attorque", "incoque", "intorque", "praetorque");
+  }
+
+  /**
+   * check if token ends with 'que' and if it should be stemmed
+   * @author mk
+   *
+   * @param termBuffer
+   *      term buffer containing token
+   * @param termLength
+   *      length of the token
+   * @return
+   *      current termLength (termLength - 3 if token ends with 'que'); if token should not be stemmed return -1
+   */
+  public int stemQUE(char[] termBuffer, int termLength) {
+    // buffer to token
+    String currentToken = String.valueOf(termBuffer, 0, termLength).toLowerCase();
+
+    // check if token should be stemmed
+    if (queList.contains(currentToken)) {
+      // don't stem the token
+      return -1;
+    }
+
+    // check if token ends with 'que'
+    if (currentToken.endsWith("que")) {
+      // cut off 'que'
+      return termLength - 3;
+    }
+    return termLength;
+  }
+
+  /**
+   * removing known noun suffixes.
+   * changes to the Snowball algorithm - additional suffixes: arum, erum, orum, ebus, uum, ium, ei, ui, im
+   * @author mk
+   *
+   * @param termBuffer
+   *      term buffer containing token
+   * @param termLength
+   *      length of the token
+   * @return
+   *      stemmed token
+   */
+  public String stemAsNoun(char termBuffer[], int termLength) {
+    // buffer to string
+    String noun = String.valueOf(termBuffer, 0, termLength).toLowerCase();
+
+    // check longest suffix
+    if ((noun.endsWith("ibus") || noun.endsWith("arum") || noun.endsWith("erum") || noun.endsWith("orum") || noun.endsWith("ebus")) && noun.length() >= 6) {
+      return String.valueOf(termBuffer, 0, termLength - 4);
+    } else if ((noun.endsWith("ius") || noun.endsWith("uum") || noun.endsWith("ium")) && noun.length() >= 5) {
+      return String.valueOf(termBuffer, 0, termLength - 3);
+    } else if ((noun.endsWith("ae") || noun.endsWith("am") || noun.endsWith("as") || noun.endsWith("em") || noun.endsWith("es")
+        || noun.endsWith("ia") || noun.endsWith("is") || noun.endsWith("nt") || noun.endsWith("os") || noun.endsWith("ud")
+        || noun.endsWith("um") || noun.endsWith("us") || noun.endsWith("ei") || noun.endsWith("ui") || noun.endsWith("im"))
+        && noun.length() >= 4) {
+      return String.valueOf(termBuffer, 0, termLength - 2);
+    } else if ((noun.endsWith("a") || noun.endsWith("e") || noun.endsWith("i") || noun.endsWith("o") || noun.endsWith("u")) && noun.length() >= 3) {
+      return String.valueOf(termBuffer, 0, termLength - 1);
+    }
+
+    // stem nothing
+    return String.valueOf(termBuffer, 0, termLength);
+  }
+
+  /**
+   * removing / changing known verb suffixes.
+   * @author mk
+   *
+   * @param termBuffer
+   *      term buffer containing token
+   * @param termLength
+   *      length of the token
+   * @return
+   *      stemmed token
+   */
+  public String stemAsVerb(char termBuffer[], int termLength) {
+    // buffer to string
+    String verb = String.valueOf(termBuffer, 0, termLength).toLowerCase();
+
+    // check suffixes
+    if (verb.endsWith("iuntur") || verb.endsWith("erunt") || verb.endsWith("untur") || verb.endsWith("iunt") || verb.endsWith("unt")) {
+      // 'iuntur' 'erunt' 'untur' 'iunt' 'unt' -> 'i'
+      return this.verbSuffixToI(termBuffer, termLength);
+    } else if (verb.endsWith("beris") || verb.endsWith("bor") || verb.endsWith("bo")) {
+      // 'beris' 'bor' 'bo' -> 'bi'
+      return this.verbSuffixToBI(termBuffer, termLength);
+    } else if (verb.endsWith("ero") && termLength >= 5) {
+      // 'ero' -> 'eri'
+      termBuffer[termLength - 1] = 'i';
+      return String.valueOf(termBuffer, 0, termLength);
+    } else if ((verb.endsWith("mini") || verb.endsWith("ntur") || verb.endsWith("stis")) && termLength >= 6) {
+      // 'mini' 'ntur' 'stis' -> delete
+      return String.valueOf(termBuffer, 0, termLength - 4);
+    } else if ((verb.endsWith("mus") || verb.endsWith("mur") || verb.endsWith("ris") || verb.endsWith("sti") || verb.endsWith("tis") || verb.endsWith("tur")) && termLength >= 5) {
+      // 'mus' 'mur' 'ris' 'sti' 'tis' 'tur' -> delete
+      return String.valueOf(termBuffer, 0, termLength - 3);
+    } else if ((verb.endsWith("ns") || verb.endsWith("nt") || verb.endsWith("ri")) && termLength >= 4) {
+      // 'ns' 'nt' 'ri' -> delete
+      return String.valueOf(termBuffer, 0, termLength - 2);
+    } else if ((verb.endsWith("m") || verb.endsWith("r") || verb.endsWith("s") || verb.endsWith("t")) && termLength >= 3) {
+      // 'm' 'r' 's' 't' -> delete
+      return String.valueOf(termBuffer, 0, termLength - 1);
+    }
+
+    // stem nothing
+    return String.valueOf(termBuffer, 0, termLength);
+  }
+
+  /**
+   * general verb suffixes
+   * present indicative active -> o, s, t, mus, tis, (u)nt, is, it, imus, itis
+   * present subjunctive active -> am, as, at, amus, atis, ant, iam, ias, iat, iamus, iatis, iant
+   *
+   * imperfect indicative active -> bam,bas,bat,bamus,batis,bant, ebam,ebas,ebat,ebamus,ebatis,ebant
+   * imperfect subjunctive active -> rem,res,ret,remus,retis,rent, erem,eres,eret,eremus,eretis,erent
+   *
+   * future I indicative active -> bo,bis,bit,bimus,bitis,bunt, am,es,et,emus,etis,ent, iam,ies,iet,iemus,ietis,ient
+   * future II indicative active ->
+   *
+   * perfect indicative active -> i,isti,it,imus,istis,erunt,
+   * perfect subjunctive active -> erim,eris,erit,erimus,eritis,erint
+   *
+   * pluperfect indicative active -> eram,eras,erat,eramus,eratis,erant
+   * pluperfect subjunctive active -> issem,isses,isset,issemus,issetis,issent
+   */
+
+  // helper methods
+  /**
+   * replacing suffix with 'i'
+   * @param termBuffer
+   *      term buffer containing token
+   * @param termLength
+   *      length of the token
+   * @return
+   *      stemmed verb
+   */
+  private String verbSuffixToI(char termBuffer[], int termLength) {
+    String verb = String.valueOf(termBuffer, 0, termLength).toLowerCase();
+    // 'iuntur' 'erunt' 'untur' 'iunt' 'unt' -> 'i'
+    if (verb.endsWith("iuntur") && termLength >= 8) {
+      return String.valueOf(termBuffer, 0, termLength - 5);
+    } else if ((verb.endsWith("erunt") || verb.endsWith("untur")) && termLength >= 7) {
+      termBuffer[termLength - 5] = 'i';
+      return String.valueOf(termBuffer, 0, termLength - 4);
+    } else if (verb.endsWith("iunt") && termLength >= 6) {
+      return String.valueOf(termBuffer, 0, termLength - 3);
+    } else if (verb.endsWith("unt") && termLength >= 5) {
+      termBuffer[termLength - 3] = 'i';
+      return String.valueOf(termBuffer, 0, termLength - 2);
+    }
+    return String.valueOf(termBuffer, 0, termLength);
+  }
+
+  /**
+   * replacing suffix with 'bi'
+   * @param termBuffer
+   *      term buffer containing token
+   * @param termLength
+   *      length of the token
+   * @return
+   *      stemmed verb
+   */
+  private String verbSuffixToBI(char termBuffer[], int termLength) {
+    String verb = String.valueOf(termBuffer, 0, termLength).toLowerCase();
+    // 'beris' 'bor' 'bo' -> 'bi'
+    if (verb.endsWith("beris") && termLength >= 7) {
+      termBuffer[termLength - 4] = 'i';
+      return String.valueOf(termBuffer, 0, termLength - 3);
+    } else if (verb.endsWith("bor") && termLength >= 5) {
+      termBuffer[termLength - 2] = 'i';
+      return String.valueOf(termBuffer, 0, termLength - 1);
+    } else if (verb.endsWith("bo") && termLength >= 4) {
+      termBuffer[termLength - 1] = 'i';
+      return String.valueOf(termBuffer, 0, termLength);
+    }
+    return String.valueOf(termBuffer, 0, termLength);
+  }
+}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/la/LatinNumberConvertFilter.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/la/LatinNumberConvertFilter.java (revision 0)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/la/LatinNumberConvertFilter.java (revision 0)
@@ -0,0 +1,49 @@
+package org.apache.lucene.analysis.la;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link LatinNumberConverter} to convert latin numbers.
+ * @author Markus Klose, Waldemar Erhardt
+ */
+public class LatinNumberConvertFilter extends TokenFilter {
+  /** converter */
+  private final LatinNumberConverter numberFormatter;
+
+  /** attributes */
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  /** constructor taking the strict mode flag */
+  public LatinNumberConvertFilter(TokenStream input, boolean strictMode) {
+    super(input);
+    this.numberFormatter = new LatinNumberConverter(strictMode);
+  }
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+
+      // token is secured by KeywordMarkerFilter -> don't convert
+      if (keywordAttr.isKeyword()) {
+        return true;
+      }
+
+      final String arabicNumber = numberFormatter.format(termAtt.buffer(), termAtt.length());
+      // change CharTermAttribute if not null
+      if (arabicNumber != null) {
+        termAtt.setEmpty().append(arabicNumber);
+        termAtt.setLength(arabicNumber.length());
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/la/LatinStemFilter.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/la/LatinStemFilter.java (revision 0)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/la/LatinStemFilter.java (revision 0)
@@ -0,0 +1,142 @@
+package org.apache.lucene.analysis.la;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link LatinStemmer} to stem Latin words.
+ * @author Markus Klose
+ */
+public final class LatinStemFilter extends TokenFilter {
+  /** stemmer */
+  private final LatinStemmer stemmer = new LatinStemmer();
+
+  /** attributes */
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  /** flag that indicates whether the noun stem (true) or the verb stem (false) is emitted next */
+  private boolean stemAsNoun = true;
+
+  /** token types */
+  public static final String TYPE_NOUN = "LATIN_NOUN";
+  public static final String TYPE_VERB = "LATIN_VERB";
+
+  /** current token information */
+  private char[] currentTokenBuffer;
+  private int currentTokenLength;
+  private int currentTokenStart;
+  private int currentTokenEnd;
+  private int currentTokenPosition;
+
+  /** constructor */
+  public LatinStemFilter(TokenStream input) {
+    super(input);
+  }
+
+  /**
+   * Entry point for latin stemming.
+   * step 1 - replace 'v' with 'u' and 'j' with 'i'
+   * step 2 - check for tokens ending with 'que'
+   * step 3 - stem as noun or verb
+   *
+   * @author Markus Klose
+   *
+   * @see org.apache.lucene.analysis.TokenStream#incrementToken()
+   */
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (currentTokenBuffer == null) {
+      if (!input.incrementToken()) {
+        return false;
+      } else {
+        // token is secured by KeywordMarkerFilter -> don't stem
+        if (keywordAttr.isKeyword()) {
+          return true;
+        }
+
+        // buffer current input
+        currentTokenBuffer = termAtt.buffer().clone();
+        currentTokenLength = termAtt.length();
+        currentTokenStart = offsetAtt.startOffset();
+        currentTokenEnd = offsetAtt.endOffset();
+        currentTokenPosition = posIncAtt.getPositionIncrement();
+      }
+    }
+
+    // reset token attributes
+    clearAttributes();
+
+    // step 1 - replace 'v' and 'j' (case sensitive)
+    this.replaceVJ(currentTokenBuffer, currentTokenLength);
+
+    String stemmedToken;
+    // step 2 - check for words to stem ending with 'que'
+    int termLength = stemmer.stemQUE(currentTokenBuffer, currentTokenLength);
+    if (termLength == -1) {
+      // write original buffer as noun and verb
+      stemmedToken = String.valueOf(currentTokenBuffer, 0, currentTokenLength);
+    } else {
+      // step 3 - stem as noun or verb
+      if (stemAsNoun) {
+        stemmedToken = stemmer.stemAsNoun(currentTokenBuffer, termLength);
+      } else {
+        stemmedToken = stemmer.stemAsVerb(currentTokenBuffer, termLength);
+      }
+    }
+
+    // switch from noun to verb or vice versa
+    String tokenType;
+    if(stemAsNoun) {
+      stemAsNoun = false;
+      tokenType = TYPE_NOUN;
+      posIncAtt.setPositionIncrement(currentTokenPosition);
+    } else {
+      stemAsNoun = true;
+      tokenType = TYPE_VERB;
+      // reset buffer
+      currentTokenBuffer = null;
+      currentTokenLength = -1;
+      posIncAtt.setPositionIncrement(0);
+    }
+
+    // create output token
+    termAtt.setEmpty().append(stemmedToken);
+    termAtt.setLength(stemmedToken.length());
+    offsetAtt.setOffset(currentTokenStart, currentTokenEnd);
+    typeAtt.setType(tokenType);
+
+    return true;
+  }
+
+  /**
+   * Replace 'v' with 'u' and 'j' with 'i' (case sensitive).
+   *
+   * @author Markus Klose
+   *
+   * @param termBuffer
+   *      term buffer containing token
+   * @param termLength
+   *      length of the token
+   */
+  private void replaceVJ(char termBuffer[], int termLength) {
+    for (int i = 0; i < termLength; i++) {
+      switch(termBuffer[i]) {
+        case 'V': termBuffer[i] = 'U'; break;
+        case 'v': termBuffer[i] = 'u'; break;
+        case 'J': termBuffer[i] = 'I'; break;
+        case 'j': termBuffer[i] = 'i'; break;
+      }
+    }
+  }
+}
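Reviewer note (not part of the patch): as a usage sketch, the two new filters can be chained behind any Tokenizer the same way the anonymous Analyzers in TestLatinNumberConvertFilter and TestLatinStemFilter do it. The snippet below is a minimal, hypothetical example; the WhitespaceTokenizer, the Version constant and the ordering (convert Roman numerals first, then stem) are illustrative assumptions, not something this patch prescribes.

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.la.LatinNumberConvertFilter;
import org.apache.lucene.analysis.la.LatinStemFilter;
import org.apache.lucene.util.Version;

public final class LatinAnalyzerSketch extends Analyzer {
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    // any Tokenizer works here; WhitespaceTokenizer is only an example
    Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_40, reader);
    // normalize Roman numerals to Arabic digits first (strictMode = true) ...
    TokenStream result = new LatinNumberConvertFilter(source, true);
    // ... then emit the noun/verb stems produced by LatinStemFilter
    result = new LatinStemFilter(result);
    return new TokenStreamComponents(source, result);
  }
}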