Index: modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff (revision ) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff (revision ) @@ -0,0 +1,13 @@ +SET UTF-8 +TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ + +SFX A Y 2 +SFX A 0 e n +SFX A 0 e t + +SFX C Y 2 +SFX C 0 d/C c +SFX C 0 c b + +PFX B Y 1 +PFX B 0 s o \ No newline at end of file Index: modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java (revision ) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java (revision ) @@ -0,0 +1,112 @@ +package org.apache.lucene.analysis.hunspell; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.hunspell.HunspellStemmer.Stem; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; + +/** + * TokenFilter that uses hunspell affix rules and words to stem tokens. 
Since hunspell supports a word having multiple + * stems, this filter can emit multiple tokens for each consumed token + */ +public final class HunspellStemFilter extends TokenFilter { + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); + private final HunspellStemmer stemmer; + + private List buffer; + private State savedState; + + private final boolean dedup; + + /** + * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided + * HunspellDictionary + * + * @param input TokenStream whose tokens will be stemmed + * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens + */ + public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary) { + this(input, dictionary, true); + } + + /** + * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided + * HunspellDictionary + * + * @param input TokenStream whose tokens will be stemmed + * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens + * @param dedup true if only unique terms should be output. + */ + public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup) { + super(input); + this.dedup = dedup; + this.stemmer = new HunspellStemmer(dictionary); + } + + /** + * {@inheritDoc} + */ + @Override + public boolean incrementToken() throws IOException { + if (buffer != null && !buffer.isEmpty()) { + Stem nextStem = buffer.remove(0); + restoreState(savedState); + posIncAtt.setPositionIncrement(0); + termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength()); + termAtt.setLength(nextStem.getStemLength()); + return true; + } + + if (!input.incrementToken()) { + return false; + } + + buffer = dedup ? stemmer.uniqueStems(termAtt.buffer(), termAtt.length()) : stemmer.stem(termAtt.buffer(), termAtt.length()); + + if (buffer.isEmpty()) { // we do not know this word, return it unchanged + return true; + } + + Stem stem = buffer.remove(0); + termAtt.copyBuffer(stem.getStem(), 0, stem.getStemLength()); + termAtt.setLength(stem.getStemLength()); + + if (!buffer.isEmpty()) { + savedState = captureState(); + } + + return true; + } + + /** + * {@inheritDoc} + */ + @Override + public void reset() throws IOException { + super.reset(); + buffer = null; + } +} Index: modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellWord.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellWord.java (revision ) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellWord.java (revision ) @@ -0,0 +1,60 @@ +package org.apache.lucene.analysis.hunspell; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Arrays; + +public class HunspellWord { + + private final char flags[]; // sorted, can we represent more concisely? + + /** + * Creates a new HunspellWord with no associated flags + */ + public HunspellWord() { + flags = null; + } + + /** + * Constructs a new HunspellWord with the given flags + * + * @param flags Flags to associate with the word + */ + public HunspellWord(char[] flags) { + this.flags = flags; + } + + /** + * Checks whether the word has the given flag associated with it + * + * @param flag Flag to check whether it is associated with the word + * @return {@code true} if the flag is associated, {@code false} otherwise + */ + public boolean hasFlag(char flag) { + return flags != null && Arrays.binarySearch(flags, flag) >= 0; + } + + /** + * Returns the flags associated with the word + * + * @return Flags asssociated with the word + */ + public char[] getFlags() { + return flags; + } +} Index: modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java (revision ) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java (revision ) @@ -0,0 +1,372 @@ +package org.apache.lucene.analysis.hunspell; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.text.ParseException; +import java.util.*; + +import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.util.Version; + +/** + * HunspellStemmer uses the affix rules declared in the HunspellDictionary to generate one or more stems for a word. It + * conforms to the algorithm in the original hunspell algorithm, including recursive suffix stripping. 
+ */ +public class HunspellStemmer { + + private static final int RECURSION_CAP = 2; + + private final HunspellDictionary dictionary; + private final StringBuilder segment = new StringBuilder(); + + /** + * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems + * + * @param dictionary HunspellDictionary that will be used to create the stems + */ + public HunspellStemmer(HunspellDictionary dictionary) { + this.dictionary = dictionary; + } + + /** + * Find the stem(s) of the provided word + * + * @param word Word to find the stems for + * @return List of stems for the word + */ + public List stem(String word) { + return stem(word.toCharArray(), word.length()); + } + + /** + * Find the stem(s) of the provided word + * + * @param word Word to find the stems for + * @return List of stems for the word + */ + public List stem(char word[], int length) { + List stems = new ArrayList(); + if (dictionary.lookupWord(word, 0, length) != null) { + stems.add(new Stem(word, length)); + } + stems.addAll(stem(word, length, null, 0)); + return stems; + } + + /** + * Find the unique stem(s) of the provided word + * + * @param word Word to find the stems for + * @return List of stems for the word + */ + public List uniqueStems(char word[], int length) { + List stems = new ArrayList(); + CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, false); + if (dictionary.lookupWord(word, 0, length) != null) { + stems.add(new Stem(word, length)); + terms.add(word); + } + List otherStems = stem(word, length, null, 0); + for (Stem s : otherStems) { + if (!terms.contains(s.stem)) { + stems.add(s); + terms.add(s.stem); + } + } + return stems; + } + + // ================================================= Helper Methods ================================================ + + /** + * Generates a list of stems for the provided word + * + * @param word Word to generate the stems for + * @param flags Flags from a previous stemming step that need to be cross-checked with any affixes in this recursive step + * @param recursionDepth Level of recursion this stemming step is at + * @return List of stems, pr an empty if no stems are found + */ + private List stem(char word[], int length, char[] flags, int recursionDepth) { + List stems = new ArrayList(); + + for (int i = 0; i < length; i++) { + List suffixes = dictionary.lookupSuffix(word, i, length - i); + if (suffixes == null) { + continue; + } + + for (HunspellAffix suffix : suffixes) { + if (hasCrossCheckedFlag(suffix.getFlag(), flags)) { + int deAffixedLength = length - suffix.getAppend().length(); + // TODO: can we do this in-place? 
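            // Annotation (not part of the patch): a worked example against the bundled test files.
            // For the token "lucene" and rule "SFX A 0 e n" (strip nothing, append "e"):
            //   deAffixedLength = 6 - 1 = 5, so the stripped word becomes "lucen" + "" = "lucen",
            //   which applyAffix then finds in the dictionary as "lucen/A", i.e. a valid stem.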
+ String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString(); + + List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth); + for (Stem stem : stemList) { + stem.addSuffix(suffix); + } + + stems.addAll(stemList); + } + } + } + + for (int i = length - 1; i >= 0; i--) { + List prefixes = dictionary.lookupPrefix(word, 0, i); + if (prefixes == null) { + continue; + } + + for (HunspellAffix prefix : prefixes) { + if (hasCrossCheckedFlag(prefix.getFlag(), flags)) { + int deAffixedStart = prefix.getAppend().length(); + int deAffixedLength = length - deAffixedStart; + + String strippedWord = new StringBuilder().append(prefix.getStrip()) + .append(word, deAffixedStart, deAffixedLength) + .toString(); + + List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth); + for (Stem stem : stemList) { + stem.addPrefix(prefix); + } + + stems.addAll(stemList); + } + } + } + + return stems; + } + + /** + * Applies the affix rule to the given word, producing a list of stems if any are found + * + * @param strippedWord Word the affix has been removed and the strip added + * @param affix HunspellAffix representing the affix rule itself + * @param recursionDepth Level of recursion this stemming step is at + * @return List of stems for the word, or an empty list if none are found + */ + @SuppressWarnings("unchecked") + public List applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) { + segment.setLength(0); + segment.append(strippedWord, 0, length); + if (!affix.checkCondition(segment)) { + return Collections.EMPTY_LIST; + } + + List stems = new ArrayList(); + + List words = dictionary.lookupWord(strippedWord, 0, length); + if (words != null) { + for (HunspellWord hunspellWord : words) { + if (hunspellWord.hasFlag(affix.getFlag())) { + stems.add(new Stem(strippedWord, length)); + } + } + } + + if (affix.isCrossProduct() && recursionDepth < RECURSION_CAP) { + stems.addAll(stem(strippedWord, length, affix.getAppendFlags(), ++recursionDepth)); + } + + return stems; + } + + /** + * Checks if the given flag cross checks with the given array of flags + * + * @param flag Flag to cross check with the array of flags + * @param flags Array of flags to cross check against. Can be {@code null} + * @return {@code true} if the flag is found in the array or the array is {@code null}, {@code false} otherwise + */ + private boolean hasCrossCheckedFlag(char flag, char[] flags) { + return flags == null || Arrays.binarySearch(flags, flag) >= 0; + } + + /** + * Stem represents all information known about a stem of a word. This includes the stem, and the prefixes and suffixes + * that were used to change the word into the stem. + */ + public static class Stem { + + private final List prefixes = new ArrayList(); + private final List suffixes = new ArrayList(); + private final char stem[]; + private final int stemLength; + + /** + * Creates a new Stem wrapping the given word stem + * + * @param stem Stem of a word + */ + public Stem(char stem[], int stemLength) { + this.stem = stem; + this.stemLength = stemLength; + } + + /** + * Adds a prefix to the list of prefixes used to generate this stem. 
Because it is assumed that prefixes are added + * depth first, the prefix is added to the front of the list + * + * @param prefix Prefix to add to the list of prefixes for this stem + */ + public void addPrefix(HunspellAffix prefix) { + prefixes.add(0, prefix); + } + + /** + * Adds a suffix to the list of suffixes used to generate this stem. Because it is assumed that suffixes are added + * depth first, the suffix is added to the end of the list + * + * @param suffix Suffix to add to the list of suffixes for this stem + */ + public void addSuffix(HunspellAffix suffix) { + suffixes.add(suffix); + } + + /** + * Returns the list of prefixes used to generate the stem + * + * @return List of prefixes used to generate the stem or an empty list if no prefixes were required + */ + public List getPrefixes() { + return prefixes; + } + + /** + * Returns the list of suffixes used to generate the stem + * + * @return List of suffixes used to generate the stem or an empty list if no suffixes were required + */ + public List getSuffixes() { + return suffixes; + } + + /** + * Returns the actual word stem itself + * + * @return Word stem itself + */ + public char[] getStem() { + return stem; + } + + /** + * @return the stemLength + */ + public int getStemLength() { + return stemLength; + } + + public String getStemString() { + return new String(stem, 0, stemLength); + } + + } + + + // ================================================= Entry Point =================================================== + + /** + * HunspellStemmer entry point. Accepts two arguments: location of affix file and location of dic file + * + * @param args Program arguments. Should contain location of affix file and location of dic file + * @throws IOException Can be thrown while reading from the files + * @throws ParseException Can be thrown while parsing the files + */ + public static void main(String[] args) throws IOException, ParseException { + if (args.length != 2) { + System.out.println("usage: HunspellStemmer "); + System.exit(1); + } + + InputStream affixInputStream = new FileInputStream(args[0]); + InputStream dicInputStream = new FileInputStream(args[1]); + + HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_40); + + affixInputStream.close(); + dicInputStream.close(); + + HunspellStemmer stemmer = new HunspellStemmer(dictionary); + + Scanner scanner = new Scanner(System.in); + + System.out.print("> "); + while (scanner.hasNextLine()) { + String word = scanner.nextLine(); + + if ("exit".equals(word)) { + break; + } + + printStemResults(word, stemmer.stem(word.toCharArray(), word.length())); + + System.out.print("> "); + } + } + + /** + * Prints the results of the stemming of a word + * + * @param originalWord Word that has been stemmed + * @param stems Stems of the word + */ + private static void printStemResults(String originalWord, List stems) { + StringBuilder builder = new StringBuilder().append("stem(").append(originalWord).append(")").append("\n"); + + for (Stem stem : stems) { + builder.append("- ").append(stem.getStem()).append(": "); + + for (HunspellAffix prefix : stem.getPrefixes()) { + builder.append(prefix.getAppend()).append("+"); + + if (hasText(prefix.getStrip())) { + builder.append(prefix.getStrip()).append("-"); + } + } + + builder.append(stem.getStem()); + + for (HunspellAffix suffix : stem.getSuffixes()) { + if (hasText(suffix.getStrip())) { + builder.append("-").append(suffix.getStrip()); + } + + builder.append("+").append(suffix.getAppend()); + } + 
builder.append("\n"); + } + + System.out.println(builder); + } + + /** + * Simple utility to check if the given String has any text + * + * @param str String to check if it has any text + * @return {@code true} if the String has text, {@code false} otherwise + */ + private static boolean hasText(String str) { + return str != null && str.length() > 0; + } +} Index: modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java (revision ) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java (revision ) @@ -0,0 +1,44 @@ +package org.apache.lucene.analysis.hunspell; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.Version; +import org.junit.Test; + +import java.io.IOException; +import java.io.InputStream; +import java.text.ParseException; + +import static junit.framework.Assert.assertEquals; + +public class HunspellDictionaryTest { + + @Test + public void testHunspellDictionary_loadDicAff() throws IOException, ParseException { + InputStream affixStream = getClass().getResourceAsStream("test.aff"); + InputStream dictStream = getClass().getResourceAsStream("test.dic"); + + HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_40); + assertEquals(2, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size()); + assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size()); + assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size()); + + affixStream.close(); + dictStream.close(); + } +} Index: modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java (revision ) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java (revision ) @@ -0,0 +1,76 @@ +package org.apache.lucene.analysis.hunspell; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.Version; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.IOException; +import java.io.InputStream; +import java.text.ParseException; +import java.util.List; + +import static junit.framework.Assert.assertEquals; + +public class HunspellStemmerTest { + + private static HunspellStemmer stemmer; + + @BeforeClass + public static void beforeClass() throws IOException, ParseException { + InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff"); + InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic"); + + HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_40); + stemmer = new HunspellStemmer(dictionary); + + affixStream.close(); + dictStream.close(); + } + + @Test + public void testStem_simpleSuffix() { + List stems = stemmer.stem("lucene"); + + assertEquals(2, stems.size()); + assertEquals("lucene", stems.get(0).getStemString()); + assertEquals("lucen", stems.get(1).getStemString()); + + stems = stemmer.stem("mahoute"); + assertEquals(1, stems.size()); + assertEquals("mahout", stems.get(0).getStemString()); + } + + @Test + public void testStem_simplePrefix() { + List stems = stemmer.stem("solr"); + + assertEquals(1, stems.size()); + assertEquals("olr", stems.get(0).getStemString()); + } + + @Test + public void testStem_recursiveSuffix() { + List stems = stemmer.stem("abcd"); + + assertEquals(1, stems.size()); + assertEquals("ab", stems.get(0).getStemString()); + } + +} Index: modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic (revision ) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic (revision ) @@ -0,0 +1,6 @@ +5 +lucen/A +lucene +mahout/A +olr/B +ab/C \ No newline at end of file Index: modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellAffix.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellAffix.java (revision ) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellAffix.java (revision ) @@ -0,0 +1,157 @@ +package org.apache.lucene.analysis.hunspell; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.regex.Pattern; + +/** + * Wrapper class representing a hunspell affix + */ +public class HunspellAffix { + + private String append; // the affix itself, what is appended + private char appendFlags[]; // continuation class flags + private String strip; + + private String condition; + private Pattern conditionPattern; + + private char flag; + + private boolean crossProduct; + + /** + * Checks whether the given text matches the conditional pattern on this affix + * + * @param text Text to check if it matches the affix's conditional pattern + * @return {@code true} if the text meets the condition, {@code false} otherwise + */ + public boolean checkCondition(CharSequence text) { + return conditionPattern.matcher(text).matches(); + } + + /** + * Returns the append defined for the affix + * + * @return Defined append + */ + public String getAppend() { + return append; + } + + /** + * Sets the append defined for the affix + * + * @param append Defined append for the affix + */ + public void setAppend(String append) { + this.append = append; + } + + /** + * Returns the flags defined for the affix append + * + * @return Flags defined for the affix append + */ + public char[] getAppendFlags() { + return appendFlags; + } + + /** + * Sets the flags defined for the affix append + * + * @param appendFlags Flags defined for the affix append + */ + public void setAppendFlags(char[] appendFlags) { + this.appendFlags = appendFlags; + } + + /** + * Returns the stripping characters defined for the affix + * + * @return Stripping characters defined for the affix + */ + public String getStrip() { + return strip; + } + + /** + * Sets the stripping characters defined for the affix + * + * @param strip Stripping characters defined for the affix + */ + public void setStrip(String strip) { + this.strip = strip; + } + + /** + * Returns the condition that must be met before the affix can be applied + * + * @return Condition that must be met before the affix can be applied + */ + public String getCondition() { + return condition; + } + + /** + * Sets the condition that must be met before the affix can be applied + * + * @param condition Condition to be met before affix application + * @param pattern Condition as a regular expression pattern + */ + public void setCondition(String condition, String pattern) { + this.condition = condition; + this.conditionPattern = Pattern.compile(pattern); + } + + /** + * Returns the affix flag + * + * @return Affix flag + */ + public char getFlag() { + return flag; + } + + /** + * Sets the affix flag + * + * @param flag Affix flag + */ + public void setFlag(char flag) { + this.flag = flag; + } + + /** + * Returns whether the affix is defined as cross product + * + * @return {@code true} if the affix is cross product, {@code false} otherwise + */ + public boolean isCrossProduct() { + return crossProduct; + } + + /** + * Sets whether the affix is defined as cross product + * + * @param crossProduct Whether the affix is defined as cross product + */ + public void setCrossProduct(boolean crossProduct) { + this.crossProduct = crossProduct; + } +} Index: modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/package.html =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/package.html (revision ) +++ 
modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/package.html (revision )
@@ -0,0 +1,26 @@
+
+
+
+Stemming TokenFilter using a Java implementation of the
+Hunspell stemming algorithm.
+
+Dictionaries can be found on
+OpenOffice's wiki
+
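
Aside (illustrative usage sketch, not part of this patch): wiring the new HunspellStemFilter into a token stream over the test.aff/test.dic resources added above. The example class name, the WhitespaceTokenizer/StringReader scaffolding and the resource lookup are assumptions for the sketch, and the import paths follow the 4.0 analysis-module layout; only HunspellDictionary and HunspellStemFilter come from this patch.

import java.io.InputStream;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.hunspell.HunspellDictionary;
import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

/** Hypothetical demo class; expects test.aff and test.dic next to it on the classpath. */
public class HunspellStemFilterExample {
  public static void main(String[] args) throws Exception {
    InputStream affix = HunspellStemFilterExample.class.getResourceAsStream("test.aff");
    InputStream dic = HunspellStemFilterExample.class.getResourceAsStream("test.dic");
    HunspellDictionary dictionary = new HunspellDictionary(affix, dic, Version.LUCENE_40);
    affix.close();
    dic.close();

    // With dedup (the default), each unique stem of a token is emitted once;
    // extra stems follow the first at position increment 0.
    Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_40, new StringReader("lucene mahoute"));
    TokenStream stream = new HunspellStemFilter(tokenizer, dictionary);
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(termAtt.toString());   // lucene, lucen, mahout
    }
    stream.end();
    stream.close();
  }
}

To keep duplicate stems instead of collapsing them, pass dedup=false through the filter's three-argument constructor.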
+ + Index: modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java (revision ) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java (revision ) @@ -0,0 +1,411 @@ +package org.apache.lucene.analysis.hunspell; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.util.CharArrayMap; +import org.apache.lucene.util.Version; + +import java.io.*; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class HunspellDictionary { + + static final HunspellWord NOFLAGS = new HunspellWord(); + + private static final String PREFIX_KEY = "PFX"; + private static final String SUFFIX_KEY = "SFX"; + private static final String FLAG_KEY = "FLAG"; + + private static final String NUM_FLAG_TYPE = "num"; + private static final String UTF8_FLAG_TYPE = "UTF-8"; + private static final String LONG_FLAG_TYPE = "long"; + + private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*"; + private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s"; + + private CharArrayMap> words; + private CharArrayMap> prefixes; + private CharArrayMap> suffixes; + + private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy + private final Version version; + + /** + * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix + * and dictionary files + * + * @param affix InputStream for reading the hunspell affix file + * @param dictionary InputStream for reading the hunspell dictionary file + * @param version Lucene Version + * @throws IOException Can be thrown while reading from the InputStreams + * @throws ParseException Can be thrown if the content of the files does not meet expected formats + */ + public HunspellDictionary(InputStream affix, InputStream dictionary, Version version) throws IOException, ParseException { + this(affix, Arrays.asList(dictionary), version); + } + + /** + * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix + * and dictionary files + * + * @param affix InputStream for reading the hunspell affix file + * @param dictionaries InputStreams for reading the hunspell dictionary file + * @param version Lucene Version + * @throws IOException Can be thrown while reading from the InputStreams + * @throws ParseException Can be thrown if the content of the files does 
not meet expected formats + */ + public HunspellDictionary(InputStream affix, List dictionaries, Version version) throws IOException, ParseException { + this.version = version; + String encoding = getDictionaryEncoding(affix); + CharsetDecoder decoder = getJavaEncoding(encoding); + readAffixFile(affix, decoder); + words = new CharArrayMap>(version, 65535 /* guess */, false); + for (InputStream dictionary : dictionaries) { + readDictionaryFile(dictionary, decoder); + } + } + + /** + * Looks up HunspellWords that match the String created from the given char array, offset and length + * + * @param word Char array to generate the String from + * @param offset Offset in the char array that the String starts at + * @param length Length from the offset that the String is + * @return List of HunspellWords that match the generated String, or {@code null} if none are found + */ + public List lookupWord(char word[], int offset, int length) { + return words.get(word, offset, length); + } + + /** + * Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array, offset and length + * + * @param word Char array to generate the String from + * @param offset Offset in the char array that the String starts at + * @param length Length from the offset that the String is + * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found + */ + public List lookupPrefix(char word[], int offset, int length) { + return prefixes.get(word, offset, length); + } + + /** + * Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array, offset and length + * + * @param word Char array to generate the String from + * @param offset Offset in the char array that the String starts at + * @param length Length from the offset that the String is + * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found + */ + public List lookupSuffix(char word[], int offset, int length) { + return suffixes.get(word, offset, length); + } + + /** + * Reads the affix file through the provided InputStream, building up the prefix and suffix maps + * + * @param affixStream InputStream to read the content of the affix file from + * @param decoder CharsetDecoder to decode the content of the file + * @throws IOException Can be thrown while reading from the InputStream + */ + private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException { + prefixes = new CharArrayMap>(version, 8, false); + suffixes = new CharArrayMap>(version, 8, false); + + BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder)); + String line = null; + while ((line = reader.readLine()) != null) { + if (line.startsWith(PREFIX_KEY)) { + parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN); + } else if (line.startsWith(SUFFIX_KEY)) { + parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN); + } else if (line.startsWith(FLAG_KEY)) { + // Assume that the FLAG line comes before any prefix or suffixes + // Store the strategy so it can be used when parsing the dic file + flagParsingStrategy = getFlagParsingStrategy(line); + } + } + reader.close(); + } + + /** + * Parses a specific affix rule putting the result into the provided affix map + * + * @param affixes Map where the result of the parsing will be put + * @param header Header line of the affix rule + * @param reader BufferedReader to read the 
content of the rule from + * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex + * pattern + * @throws IOException Can be thrown while reading the rule + */ + private void parseAffix(CharArrayMap> affixes, + String header, + BufferedReader reader, + String conditionPattern) throws IOException { + String args[] = header.split("\\s+"); + + boolean crossProduct = args[2].equals("Y"); + + int numLines = Integer.parseInt(args[3]); + for (int i = 0; i < numLines; i++) { + String line = reader.readLine(); + String ruleArgs[] = line.split("\\s+"); + + HunspellAffix affix = new HunspellAffix(); + + affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1])); + affix.setStrip(ruleArgs[2].equals("0") ? "" : ruleArgs[2]); + + String affixArg = ruleArgs[3]; + + int flagSep = affixArg.lastIndexOf('/'); + if (flagSep != -1) { + char appendFlags[] = flagParsingStrategy.parseFlags(affixArg.substring(flagSep + 1)); + Arrays.sort(appendFlags); + affix.setAppendFlags(appendFlags); + affix.setAppend(affixArg.substring(0, flagSep)); + } else { + affix.setAppend(affixArg); + } + + String condition = ruleArgs[4]; + affix.setCondition(condition, String.format(conditionPattern, condition)); + affix.setCrossProduct(crossProduct); + + List list = affixes.get(affix.getAppend()); + if (list == null) { + list = new ArrayList(); + affixes.put(affix.getAppend(), list); + } + + list.add(affix); + } + } + + /** + * Parses the encoding specificed in the affix file readable through the provided InputStream + * + * @param affix InputStream for reading the affix file + * @return Encoding specified in the affix file + * @throws IOException Can be thrown while reading from the InputStream + * @throws ParseException Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET } + */ + private String getDictionaryEncoding(InputStream affix) throws IOException, ParseException { + final StringBuilder encoding = new StringBuilder(); + for (;;) { + encoding.setLength(0); + int ch; + while ((ch = affix.read()) >= 0) { + if (ch == '\n') { + break; + } + if (ch != '\r') { + encoding.append((char)ch); + } + } + if ( + encoding.length() == 0 || encoding.charAt(0) == '#' || + // this test only at the end as ineffective but would allow lines only containing spaces: + encoding.toString().trim().length() == 0 + ) { + if (ch < 0) { + throw new ParseException("Unexpected end of affix file.", 0); + } + continue; + } + if ("SET ".equals(encoding.substring(0, 4))) { + // cleanup the encoding string, too (whitespace) + return encoding.substring(4).trim(); + } + throw new ParseException("The first non-comment line in the affix file must "+ + "be a 'SET charset', was: '" + encoding +"'", 0); + } + } + + /** + * Retrieves the CharsetDecoder for the given encoding. Note, This isn't perfect as I think ISCII-DEVANAGARI and + * MICROSOFT-CP1251 etc are allowed... 
+ * + * @param encoding Encoding to retrieve the CharsetDecoder for + * @return CharSetDecoder for the given encoding + */ + private CharsetDecoder getJavaEncoding(String encoding) { + Charset charset = Charset.forName(encoding); + return charset.newDecoder(); + } + + /** + * Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definiton line taken from the affix file + * + * @param flagLine Line containing the flag information + * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definiton + */ + private FlagParsingStrategy getFlagParsingStrategy(String flagLine) { + String flagType = flagLine.substring(5); + + if (NUM_FLAG_TYPE.equals(flagType)) { + return new NumFlagParsingStrategy(); + } else if (UTF8_FLAG_TYPE.equals(flagType)) { + return new SimpleFlagParsingStrategy(); + } else if (LONG_FLAG_TYPE.equals(flagType)) { + return new DoubleASCIIFlagParsingStrategy(); + } + + throw new IllegalArgumentException("Unknown flag type: " + flagType); + } + + /** + * Reads the dictionary file through the provided InputStream, building up the words map + * + * @param dictionary InputStream to read the dictionary file through + * @param decoder CharsetDecoder used to decode the contents of the file + * @throws IOException Can be thrown while reading from the file + */ + private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder) throws IOException { + BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder)); + // nocommit, don't create millions of strings. + String line = reader.readLine(); // first line is number of entries + int numEntries = Integer.parseInt(line); + + // nocommit, the flags themselves can be double-chars (long) or also numeric + // either way the trick is to encode them as char... but they must be parsed differently + while ((line = reader.readLine()) != null) { + String entry; + HunspellWord wordForm; + + int flagSep = line.lastIndexOf('/'); + if (flagSep == -1) { + wordForm = NOFLAGS; + entry = line; + } else { + // note, there can be comments (morph description) after a flag. + // we should really look for any whitespace + int end = line.indexOf('\t', flagSep); + if (end == -1) + end = line.length(); + + + wordForm = new HunspellWord(flagParsingStrategy.parseFlags(line.substring(flagSep + 1, end))); + Arrays.sort(wordForm.getFlags()); + entry = line.substring(0, flagSep); + } + + List entries = words.get(entry); + if (entries == null) { + entries = new ArrayList(); + words.put(entry, entries); + } + entries.add(wordForm); + } + } + + public Version getVersion() { + return version; + } + + /** + * Abstraction of the process of parsing flags taken from the affix and dic files + */ + private static abstract class FlagParsingStrategy { + + /** + * Parses the given String into a single flag + * + * @param rawFlag String to parse into a flag + * @return Parsed flag + */ + char parseFlag(String rawFlag) { + return parseFlags(rawFlag)[0]; + } + + /** + * Parses the given String into multiple flags + * + * @param rawFlags String to parse into flags + * @return Parsed flags + */ + abstract char[] parseFlags(String rawFlags); + } + + /** + * Simple implementation of {@link FlagParsingStrategy} that treats the chars in each String as a individual flags. + * Can be used with both the ASCII and UTF-8 flag types. 
+ */ + private static class SimpleFlagParsingStrategy extends FlagParsingStrategy { + /** + * {@inheritDoc} + */ + public char[] parseFlags(String rawFlags) { + return rawFlags.toCharArray(); + } + } + + /** + * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form. In the case + * of multiple flags, each number is separated by a comma. + */ + private static class NumFlagParsingStrategy extends FlagParsingStrategy { + /** + * {@inheritDoc} + */ + public char[] parseFlags(String rawFlags) { + String[] rawFlagParts = rawFlags.trim().split(","); + char[] flags = new char[rawFlagParts.length]; + + for (int i = 0; i < rawFlagParts.length; i++) { + // note, removing the trailing X/leading I for nepali... what is the rule here?! + flags[i] = (char) Integer.parseInt(rawFlagParts[i].replaceAll("[^0-9]", "")); + } + + return flags; + } + } + + /** + * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes + * must be combined into a single character. + * + * TODO (rmuir) test + */ + private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy { + + /** + * {@inheritDoc} + */ + public char[] parseFlags(String rawFlags) { + if (rawFlags.length() == 0) { + return new char[0]; + } + + StringBuilder builder = new StringBuilder(); + for (int i = 0; i < rawFlags.length(); i+=2) { + char cookedFlag = (char) ((int) rawFlags.charAt(i) + (int) rawFlags.charAt(i + 1)); + builder.append(cookedFlag); + } + + char flags[] = new char[builder.length()]; + builder.getChars(0, builder.length(), flags, 0); + return flags; + } + } +}
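
Aside (illustrative, not part of this patch): the stemmer can also be driven directly, mirroring HunspellStemmerTest. The class name and resource lookup below are assumptions; everything else uses only the classes added above.

import java.io.InputStream;
import java.util.List;

import org.apache.lucene.analysis.hunspell.HunspellDictionary;
import org.apache.lucene.analysis.hunspell.HunspellStemmer;
import org.apache.lucene.analysis.hunspell.HunspellStemmer.Stem;
import org.apache.lucene.util.Version;

/** Hypothetical demo class; expects test.aff and test.dic next to it on the classpath. */
public class HunspellStemmerExample {
  public static void main(String[] args) throws Exception {
    InputStream affix = HunspellStemmerExample.class.getResourceAsStream("test.aff");
    InputStream dic = HunspellStemmerExample.class.getResourceAsStream("test.dic");
    HunspellDictionary dictionary = new HunspellDictionary(affix, dic, Version.LUCENE_40);
    affix.close();
    dic.close();

    HunspellStemmer stemmer = new HunspellStemmer(dictionary);

    // "lucene" is both a dictionary entry and "lucen" + suffix flag A,
    // so two stems come back: "lucene" and "lucen".
    List<Stem> stems = stemmer.stem("lucene");
    for (Stem stem : stems) {
      System.out.println(stem.getStemString());
    }
  }
}

The HunspellStemmer.main entry point in the patch provides the same check interactively, taking the affix and dictionary file locations as program arguments.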