Index: solr/src/test/org/apache/solr/analysis/TestKeywordMarkerFilterFactory.java =================================================================== --- solr/src/test/org/apache/solr/analysis/TestKeywordMarkerFilterFactory.java (revision 940315) +++ solr/src/test/org/apache/solr/analysis/TestKeywordMarkerFilterFactory.java (working copy) @@ -23,7 +23,7 @@ import java.util.HashMap; import java.util.Map; -import org.apache.lucene.analysis.PorterStemFilter; +import org.apache.lucene.analysis.en.PorterStemFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.WhitespaceTokenizer; Index: solr/src/test/org/apache/solr/analysis/TestStemmerOverrideFilterFactory.java =================================================================== --- solr/src/test/org/apache/solr/analysis/TestStemmerOverrideFilterFactory.java (revision 940315) +++ solr/src/test/org/apache/solr/analysis/TestStemmerOverrideFilterFactory.java (working copy) @@ -23,7 +23,7 @@ import java.util.HashMap; import java.util.Map; -import org.apache.lucene.analysis.PorterStemFilter; +import org.apache.lucene.analysis.en.PorterStemFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.WhitespaceTokenizer; Index: solr/src/test/org/apache/solr/handler/DocumentAnalysisRequestHandlerTest.java =================================================================== --- solr/src/test/org/apache/solr/handler/DocumentAnalysisRequestHandlerTest.java (revision 940315) +++ solr/src/test/org/apache/solr/handler/DocumentAnalysisRequestHandlerTest.java (working copy) @@ -190,7 +190,7 @@ assertNotNull("Expecting the 'StopFilter' to be applied on the query for the 'text' field", tokenList); assertEquals("Query has only one token", 1, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("jumping", null, "", 0, 7, 1, null, false)); - tokenList = (List) 
queryResult.get("org.apache.lucene.analysis.PorterStemFilter"); + tokenList = (List) queryResult.get("org.apache.lucene.analysis.en.PorterStemFilter"); assertNotNull("Expecting the 'PorterStemFilter' to be applied on the query for the 'text' field", tokenList); assertEquals("Query has only one token", 1, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("jump", null, "", 0, 7, 1, null, false)); @@ -231,7 +231,7 @@ assertToken(tokenList.get(1), new TokenInfo("jumped", null, "", 8, 14, 2, null, false)); assertToken(tokenList.get(2), new TokenInfo("over", null, "", 15, 19, 3, null, false)); assertToken(tokenList.get(3), new TokenInfo("dogs", null, "", 24, 28, 4, null, false)); - tokenList = valueResult.get("org.apache.lucene.analysis.PorterStemFilter"); + tokenList = valueResult.get("org.apache.lucene.analysis.en.PorterStemFilter"); assertNotNull("Expecting the 'PorterStemFilter' to be applied on the index for the 'text' field", tokenList); assertEquals("Expecting 4 tokens", 4, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("fox", null, "", 4, 7, 1, null, false)); Index: solr/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java =================================================================== --- solr/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java (revision 940315) +++ solr/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java (working copy) @@ -173,7 +173,7 @@ assertToken(tokenList.get(5), new TokenInfo("lazy", null, "", 34, 38, 6, null, false)); assertToken(tokenList.get(6), new TokenInfo("brown", null, "", 39, 44, 7, null, true)); assertToken(tokenList.get(7), new TokenInfo("dogs", null, "", 45, 49, 8, null, false)); - tokenList = indexPart.get("org.apache.lucene.analysis.PorterStemFilter"); + tokenList = indexPart.get("org.apache.lucene.analysis.en.PorterStemFilter"); assertNotNull("Expcting PorterStemFilter analysis breakdown", tokenList); assertEquals(tokenList.size(), 8); 
assertToken(tokenList.get(0), new TokenInfo("quick", null, "", 4, 9, 1, null, false)); @@ -208,7 +208,7 @@ assertEquals(2, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("fox", null, "", 0, 3, 1, null, false)); assertToken(tokenList.get(1), new TokenInfo("brown", null, "", 4, 9, 2, null, false)); - tokenList = queryPart.get("org.apache.lucene.analysis.PorterStemFilter"); + tokenList = queryPart.get("org.apache.lucene.analysis.en.PorterStemFilter"); assertNotNull("Expcting PorterStemFilter analysis breakdown", tokenList); assertEquals(2, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("fox", null, "", 0, 3, 1, null, false)); Index: solr/src/java/org/apache/solr/analysis/PorterStemFilterFactory.java =================================================================== --- solr/src/java/org/apache/solr/analysis/PorterStemFilterFactory.java (revision 940315) +++ solr/src/java/org/apache/solr/analysis/PorterStemFilterFactory.java (working copy) @@ -18,7 +18,7 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.PorterStemFilter; +import org.apache.lucene.analysis.en.PorterStemFilter; /** * @version $Id$ Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 940315) +++ lucene/CHANGES.txt (working copy) @@ -4,6 +4,10 @@ Changes in backwards compatibility policy +* LUCENE-2413: Consolidated all Lucene analyzers into contrib/analyzers. + - o.a.l.analysis.PorterStemFilter -> o.a.l.analysis.en.PorterStemFilter + ... 
(in progress) + * LUCENE-1458, LUCENE-2111, LUCENE-2354: Changes from flexible indexing: - On upgrading to 3.1, if you do not fully reindex your documents, Index: lucene/src/test/org/apache/lucene/analysis/TestPorterStemFilter.java =================================================================== --- lucene/src/test/org/apache/lucene/analysis/TestPorterStemFilter.java (revision 940315) +++ lucene/src/test/org/apache/lucene/analysis/TestPorterStemFilter.java (working copy) @@ -1,65 +0,0 @@ -package org.apache.lucene.analysis; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.StringReader; -import java.util.zip.ZipFile; - -/** - * Test the PorterStemFilter with Martin Porter's test data. 
- */ -public class TestPorterStemFilter extends BaseTokenStreamTestCase { - /** - * Run the stemmer against all strings in voc.txt - * The output should be the same as the string in output.txt - */ - public void testPorterStemFilter() throws Exception { - Tokenizer tokenizer = new KeywordTokenizer(new StringReader("")); - TokenStream filter = new PorterStemFilter(tokenizer); - ZipFile zipFile = new ZipFile(getDataFile("porterTestData.zip")); - InputStream voc = zipFile.getInputStream(zipFile.getEntry("voc.txt")); - InputStream out = zipFile.getInputStream(zipFile.getEntry("output.txt")); - BufferedReader vocReader = new BufferedReader(new InputStreamReader( - voc, "UTF-8")); - BufferedReader outputReader = new BufferedReader(new InputStreamReader( - out, "UTF-8")); - String inputWord = null; - while ((inputWord = vocReader.readLine()) != null) { - String expectedWord = outputReader.readLine(); - assertNotNull(expectedWord); - tokenizer.reset(new StringReader(inputWord)); - filter.reset(); - assertTokenStreamContents(filter, new String[] { expectedWord }); - } - vocReader.close(); - outputReader.close(); - zipFile.close(); - } - - public void testWithKeywordAttribute() throws IOException { - CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true); - set.add("yourselves"); - Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("yourselves yours")); - TokenStream filter = new PorterStemFilter(new KeywordMarkerFilter(tokenizer, set)); - assertTokenStreamContents(filter, new String[] {"yourselves", "your"}); - } -} Index: lucene/src/test/org/apache/lucene/analysis/porterTestData.zip =================================================================== Cannot display: file marked as a binary type. 
svn:mime-type = application/octet-stream Index: lucene/src/java/org/apache/lucene/analysis/PorterStemmer.java =================================================================== --- lucene/src/java/org/apache/lucene/analysis/PorterStemmer.java (revision 940315) +++ lucene/src/java/org/apache/lucene/analysis/PorterStemmer.java (working copy) @@ -1,547 +0,0 @@ -package org.apache.lucene.analysis; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - - Porter stemmer in Java. The original paper is in - - Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, - no. 3, pp 130-137, - - See also http://www.tartarus.org/~martin/PorterStemmer/index.html - - Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below. - Tthe words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1] - is then out outside the bounds of b. - - Similarly, - - Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below. - 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and - b[j] is then outside the bounds of b. - - Release 3. - - [ This version is derived from Release 3, modified by Brian Goetz to - optimize for fewer object creations. 
] - -*/ - - -import java.io.IOException; -import java.io.InputStream; -import java.io.FileInputStream; - -import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_CHAR; -import org.apache.lucene.util.ArrayUtil; - -/** - * - * Stemmer, implementing the Porter Stemming Algorithm - * - * The Stemmer class transforms a word into its root form. The input - * word can be provided a character at time (by calling add()), or at once - * by calling one of the various stem(something) methods. - */ - -class PorterStemmer -{ - private char[] b; - private int i, /* offset into b */ - j, k, k0; - private boolean dirty = false; - private static final int INITIAL_SIZE = 50; - - public PorterStemmer() { - b = new char[INITIAL_SIZE]; - i = 0; - } - - /** - * reset() resets the stemmer so it can stem another word. If you invoke - * the stemmer by calling add(char) and then stem(), you must call reset() - * before starting another word. - */ - public void reset() { i = 0; dirty = false; } - - /** - * Add a character to the word being stemmed. When you are finished - * adding characters, you can call stem(void) to process the word. - */ - public void add(char ch) { - if (b.length <= i) { - b = ArrayUtil.grow(b, i+1); - } - b[i++] = ch; - } - - /** - * After a word has been stemmed, it can be retrieved by toString(), - * or a reference to the internal buffer can be retrieved by getResultBuffer - * and getResultLength (which is generally more efficient.) - */ - @Override - public String toString() { return new String(b,0,i); } - - /** - * Returns the length of the word resulting from the stemming process. - */ - public int getResultLength() { return i; } - - /** - * Returns a reference to a character buffer containing the results of - * the stemming process. You also need to consult getResultLength() - * to determine the length of the result. - */ - public char[] getResultBuffer() { return b; } - - /* cons(i) is true <=> b[i] is a consonant. 
*/ - - private final boolean cons(int i) { - switch (b[i]) { - case 'a': case 'e': case 'i': case 'o': case 'u': - return false; - case 'y': - return (i==k0) ? true : !cons(i-1); - default: - return true; - } - } - - /* m() measures the number of consonant sequences between k0 and j. if c is - a consonant sequence and v a vowel sequence, and <..> indicates arbitrary - presence, - - gives 0 - vc gives 1 - vcvc gives 2 - vcvcvc gives 3 - .... - */ - - private final int m() { - int n = 0; - int i = k0; - while(true) { - if (i > j) - return n; - if (! cons(i)) - break; - i++; - } - i++; - while(true) { - while(true) { - if (i > j) - return n; - if (cons(i)) - break; - i++; - } - i++; - n++; - while(true) { - if (i > j) - return n; - if (! cons(i)) - break; - i++; - } - i++; - } - } - - /* vowelinstem() is true <=> k0,...j contains a vowel */ - - private final boolean vowelinstem() { - int i; - for (i = k0; i <= j; i++) - if (! cons(i)) - return true; - return false; - } - - /* doublec(j) is true <=> j,(j-1) contain a double consonant. */ - - private final boolean doublec(int j) { - if (j < k0+1) - return false; - if (b[j] != b[j-1]) - return false; - return cons(j); - } - - /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant - and also if the second c is not w,x or y. this is used when trying to - restore an e at the end of a short word. e.g. - - cav(e), lov(e), hop(e), crim(e), but - snow, box, tray. - - */ - - private final boolean cvc(int i) { - if (i < k0+2 || !cons(i) || cons(i-1) || !cons(i-2)) - return false; - else { - int ch = b[i]; - if (ch == 'w' || ch == 'x' || ch == 'y') return false; - } - return true; - } - - private final boolean ends(String s) { - int l = s.length(); - int o = k-l+1; - if (o < k0) - return false; - for (int i = 0; i < l; i++) - if (b[o+i] != s.charAt(i)) - return false; - j = k-l; - return true; - } - - /* setto(s) sets (j+1),...k to the characters in the string s, readjusting - k. 
*/ - - void setto(String s) { - int l = s.length(); - int o = j+1; - for (int i = 0; i < l; i++) - b[o+i] = s.charAt(i); - k = j+l; - dirty = true; - } - - /* r(s) is used further down. */ - - void r(String s) { if (m() > 0) setto(s); } - - /* step1() gets rid of plurals and -ed or -ing. e.g. - - caresses -> caress - ponies -> poni - ties -> ti - caress -> caress - cats -> cat - - feed -> feed - agreed -> agree - disabled -> disable - - matting -> mat - mating -> mate - meeting -> meet - milling -> mill - messing -> mess - - meetings -> meet - - */ - - private final void step1() { - if (b[k] == 's') { - if (ends("sses")) k -= 2; - else if (ends("ies")) setto("i"); - else if (b[k-1] != 's') k--; - } - if (ends("eed")) { - if (m() > 0) - k--; - } - else if ((ends("ed") || ends("ing")) && vowelinstem()) { - k = j; - if (ends("at")) setto("ate"); - else if (ends("bl")) setto("ble"); - else if (ends("iz")) setto("ize"); - else if (doublec(k)) { - int ch = b[k--]; - if (ch == 'l' || ch == 's' || ch == 'z') - k++; - } - else if (m() == 1 && cvc(k)) - setto("e"); - } - } - - /* step2() turns terminal y to i when there is another vowel in the stem. */ - - private final void step2() { - if (ends("y") && vowelinstem()) { - b[k] = 'i'; - dirty = true; - } - } - - /* step3() maps double suffices to single ones. so -ization ( = -ize plus - -ation) maps to -ize etc. note that the string before the suffix must give - m() > 0. 
*/ - - private final void step3() { - if (k == k0) return; /* For Bug 1 */ - switch (b[k-1]) { - case 'a': - if (ends("ational")) { r("ate"); break; } - if (ends("tional")) { r("tion"); break; } - break; - case 'c': - if (ends("enci")) { r("ence"); break; } - if (ends("anci")) { r("ance"); break; } - break; - case 'e': - if (ends("izer")) { r("ize"); break; } - break; - case 'l': - if (ends("bli")) { r("ble"); break; } - if (ends("alli")) { r("al"); break; } - if (ends("entli")) { r("ent"); break; } - if (ends("eli")) { r("e"); break; } - if (ends("ousli")) { r("ous"); break; } - break; - case 'o': - if (ends("ization")) { r("ize"); break; } - if (ends("ation")) { r("ate"); break; } - if (ends("ator")) { r("ate"); break; } - break; - case 's': - if (ends("alism")) { r("al"); break; } - if (ends("iveness")) { r("ive"); break; } - if (ends("fulness")) { r("ful"); break; } - if (ends("ousness")) { r("ous"); break; } - break; - case 't': - if (ends("aliti")) { r("al"); break; } - if (ends("iviti")) { r("ive"); break; } - if (ends("biliti")) { r("ble"); break; } - break; - case 'g': - if (ends("logi")) { r("log"); break; } - } - } - - /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */ - - private final void step4() { - switch (b[k]) { - case 'e': - if (ends("icate")) { r("ic"); break; } - if (ends("ative")) { r(""); break; } - if (ends("alize")) { r("al"); break; } - break; - case 'i': - if (ends("iciti")) { r("ic"); break; } - break; - case 'l': - if (ends("ical")) { r("ic"); break; } - if (ends("ful")) { r(""); break; } - break; - case 's': - if (ends("ness")) { r(""); break; } - break; - } - } - - /* step5() takes off -ant, -ence etc., in context vcvc. 
*/ - - private final void step5() { - if (k == k0) return; /* for Bug 1 */ - switch (b[k-1]) { - case 'a': - if (ends("al")) break; - return; - case 'c': - if (ends("ance")) break; - if (ends("ence")) break; - return; - case 'e': - if (ends("er")) break; return; - case 'i': - if (ends("ic")) break; return; - case 'l': - if (ends("able")) break; - if (ends("ible")) break; return; - case 'n': - if (ends("ant")) break; - if (ends("ement")) break; - if (ends("ment")) break; - /* element etc. not stripped before the m */ - if (ends("ent")) break; - return; - case 'o': - if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break; - /* j >= 0 fixes Bug 2 */ - if (ends("ou")) break; - return; - /* takes care of -ous */ - case 's': - if (ends("ism")) break; - return; - case 't': - if (ends("ate")) break; - if (ends("iti")) break; - return; - case 'u': - if (ends("ous")) break; - return; - case 'v': - if (ends("ive")) break; - return; - case 'z': - if (ends("ize")) break; - return; - default: - return; - } - if (m() > 1) - k = j; - } - - /* step6() removes a final -e if m() > 1. */ - - private final void step6() { - j = k; - if (b[k] == 'e') { - int a = m(); - if (a > 1 || a == 1 && !cvc(k-1)) - k--; - } - if (b[k] == 'l' && doublec(k) && m() > 1) - k--; - } - - - /** - * Stem a word provided as a String. Returns the result as a String. - */ - public String stem(String s) { - if (stem(s.toCharArray(), s.length())) - return toString(); - else - return s; - } - - /** Stem a word contained in a char[]. Returns true if the stemming process - * resulted in a word different from the input. You can retrieve the - * result with getResultLength()/getResultBuffer() or toString(). - */ - public boolean stem(char[] word) { - return stem(word, word.length); - } - - /** Stem a word contained in a portion of a char[] array. Returns - * true if the stemming process resulted in a word different from - * the input. 
You can retrieve the result with - * getResultLength()/getResultBuffer() or toString(). - */ - public boolean stem(char[] wordBuffer, int offset, int wordLen) { - reset(); - if (b.length < wordLen) { - b = new char[ArrayUtil.oversize(wordLen, NUM_BYTES_CHAR)]; - } - System.arraycopy(wordBuffer, offset, b, 0, wordLen); - i = wordLen; - return stem(0); - } - - /** Stem a word contained in a leading portion of a char[] array. - * Returns true if the stemming process resulted in a word different - * from the input. You can retrieve the result with - * getResultLength()/getResultBuffer() or toString(). - */ - public boolean stem(char[] word, int wordLen) { - return stem(word, 0, wordLen); - } - - /** Stem the word placed into the Stemmer buffer through calls to add(). - * Returns true if the stemming process resulted in a word different - * from the input. You can retrieve the result with - * getResultLength()/getResultBuffer() or toString(). - */ - public boolean stem() { - return stem(0); - } - - public boolean stem(int i0) { - k = i - 1; - k0 = i0; - if (k > k0+1) { - step1(); step2(); step3(); step4(); step5(); step6(); - } - // Also, a word is considered dirty if we lopped off letters - // Thanks to Ifigenia Vairelles for pointing this out. - if (i != k+1) - dirty = true; - i = k+1; - return dirty; - } - - /** Test program for demonstrating the Stemmer. It reads a file and - * stems each word, writing the result to standard out. 
- * Usage: Stemmer file-name - */ - public static void main(String[] args) { - PorterStemmer s = new PorterStemmer(); - - for (int i = 0; i < args.length; i++) { - try { - InputStream in = new FileInputStream(args[i]); - byte[] buffer = new byte[1024]; - int bufferLen, offset, ch; - - bufferLen = in.read(buffer); - offset = 0; - s.reset(); - - while(true) { - if (offset < bufferLen) - ch = buffer[offset++]; - else { - bufferLen = in.read(buffer); - offset = 0; - if (bufferLen < 0) - ch = -1; - else - ch = buffer[offset++]; - } - - if (Character.isLetter((char) ch)) { - s.add(Character.toLowerCase((char) ch)); - } - else { - s.stem(); - System.out.print(s.toString()); - s.reset(); - if (ch < 0) - break; - else { - System.out.print((char) ch); - } - } - } - - in.close(); - } - catch (IOException e) { - System.out.println("error reading " + args[i]); - } - } - } -} - Index: lucene/src/java/org/apache/lucene/analysis/PorterStemFilter.java =================================================================== --- lucene/src/java/org/apache/lucene/analysis/PorterStemFilter.java (revision 940315) +++ lucene/src/java/org/apache/lucene/analysis/PorterStemFilter.java (working copy) @@ -1,67 +0,0 @@ -package org.apache.lucene.analysis; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; - -/** Transforms the token stream as per the Porter stemming algorithm. - Note: the input to the stemming filter must already be in lower case, - so you will need to use LowerCaseFilter or LowerCaseTokenizer farther - down the Tokenizer chain in order for this to work properly! -

- To use this filter with other analyzers, you'll want to write an - Analyzer class that sets up the TokenStream chain as you want it. - To use this with LowerCaseTokenizer, for example, you'd write an - analyzer like this: -

-

-    class MyAnalyzer extends Analyzer {
-      public final TokenStream tokenStream(String fieldName, Reader reader) {
-        return new PorterStemFilter(new LowerCaseTokenizer(reader));
-      }
-    }
-    
-

- Note: This filter is aware of the {@link KeywordAttribute}. To prevent - certain terms from being passed to the stemmer - {@link KeywordAttribute#isKeyword()} should be set to true - in a previous {@link TokenStream}. -

-*/ -public final class PorterStemFilter extends TokenFilter { - private final PorterStemmer stemmer = new PorterStemmer(); - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); - - public PorterStemFilter(TokenStream in) { - super(in); - } - - @Override - public final boolean incrementToken() throws IOException { - if (!input.incrementToken()) - return false; - - if ((!keywordAttr.isKeyword()) && stemmer.stem(termAtt.buffer(), 0, termAtt.length())) - termAtt.copyBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength()); - return true; - } -} Index: lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/AnalyzerUtil.java =================================================================== --- lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/AnalyzerUtil.java (revision 940315) +++ lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/AnalyzerUtil.java (working copy) @@ -1,464 +0,0 @@ -package org.apache.lucene.wordnet; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; -import java.io.PrintStream; -import java.io.Reader; -import java.io.StringReader; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Comparator; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; -import java.util.regex.Pattern; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.PorterStemFilter; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; -import org.apache.lucene.util.AttributeSource; - -/** - * Various fulltext analysis utilities avoiding redundant code in several - * classes. - * - */ -public class AnalyzerUtil { - - private AnalyzerUtil() {} - - /** - * Returns a simple analyzer wrapper that logs all tokens produced by the - * underlying child analyzer to the given log stream (typically System.err); - * Otherwise behaves exactly like the child analyzer, delivering the very - * same tokens; useful for debugging purposes on custom indexing and/or - * querying. 
- * - * @param child - * the underlying child analyzer - * @param log - * the print stream to log to (typically System.err) - * @param logName - * a name for this logger (typically "log" or similar) - * @return a logging analyzer - */ - public static Analyzer getLoggingAnalyzer(final Analyzer child, - final PrintStream log, final String logName) { - - if (child == null) - throw new IllegalArgumentException("child analyzer must not be null"); - if (log == null) - throw new IllegalArgumentException("logStream must not be null"); - - return new Analyzer() { - @Override - public TokenStream tokenStream(final String fieldName, Reader reader) { - return new TokenFilter(child.tokenStream(fieldName, reader)) { - private int position = -1; - private TermAttribute termAtt = addAttribute(TermAttribute.class); - private PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); - private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); - private TypeAttribute typeAtt = addAttribute(TypeAttribute.class); - - @Override - public boolean incrementToken() throws IOException { - boolean hasNext = input.incrementToken(); - log.println(toString(hasNext)); - return hasNext; - } - - private String toString(boolean hasNext) { - if (!hasNext) return "[" + logName + ":EOS:" + fieldName + "]\n"; - - position += posIncrAtt.getPositionIncrement(); - return "[" + logName + ":" + position + ":" + fieldName + ":" - + termAtt.term() + ":" + offsetAtt.startOffset() - + "-" + offsetAtt.endOffset() + ":" + typeAtt.type() - + "]"; - } - }; - } - }; - } - - - /** - * Returns an analyzer wrapper that returns at most the first - * maxTokens tokens from the underlying child analyzer, - * ignoring all remaining tokens. 
- * - * @param child - * the underlying child analyzer - * @param maxTokens - * the maximum number of tokens to return from the underlying - * analyzer (a value of Integer.MAX_VALUE indicates unlimited) - * @return an analyzer wrapper - */ - public static Analyzer getMaxTokenAnalyzer( - final Analyzer child, final int maxTokens) { - - if (child == null) - throw new IllegalArgumentException("child analyzer must not be null"); - if (maxTokens < 0) - throw new IllegalArgumentException("maxTokens must not be negative"); - if (maxTokens == Integer.MAX_VALUE) - return child; // no need to wrap - - return new Analyzer() { - @Override - public TokenStream tokenStream(String fieldName, Reader reader) { - return new TokenFilter(child.tokenStream(fieldName, reader)) { - private int todo = maxTokens; - - @Override - public boolean incrementToken() throws IOException { - return --todo >= 0 ? input.incrementToken() : false; - } - }; - } - }; - } - - - /** - * Returns an English stemming analyzer that stems tokens from the - * underlying child analyzer according to the Porter stemming algorithm. The - * child analyzer must deliver tokens in lower case for the stemmer to work - * properly. - *

- * Background: Stemming reduces token terms to their linguistic root form - * e.g. reduces "fishing" and "fishes" to "fish", "family" and "families" to - * "famili", as well as "complete" and "completion" to "complet". Note that - * the root form is not necessarily a meaningful word in itself, and that - * this is not a bug but rather a feature, if you lean back and think about - * fuzzy word matching for a bit. - *

- * See the Lucene contrib packages for stemmers (and stop words) for German, - * Russian and many more languages. - * - * @param child - * the underlying child analyzer - * @return an analyzer wrapper - */ - public static Analyzer getPorterStemmerAnalyzer(final Analyzer child) { - - if (child == null) - throw new IllegalArgumentException("child analyzer must not be null"); - - return new Analyzer() { - @Override - public TokenStream tokenStream(String fieldName, Reader reader) { - return new PorterStemFilter( - child.tokenStream(fieldName, reader)); -// /* PorterStemFilter and SnowballFilter have the same behaviour, -// but PorterStemFilter is much faster. */ -// return new org.apache.lucene.analysis.snowball.SnowballFilter( -// child.tokenStream(fieldName, reader), "English"); - } - }; - } - - - /** - * Returns an analyzer wrapper that wraps the underlying child analyzer's - * token stream into a {@link SynonymTokenFilter}. - * - * @param child - * the underlying child analyzer - * @param synonyms - * the map used to extract synonyms for terms - * @param maxSynonyms - * the maximum number of synonym tokens to return per underlying - * token word (a value of Integer.MAX_VALUE indicates unlimited) - * @return a new analyzer - */ - public static Analyzer getSynonymAnalyzer(final Analyzer child, - final SynonymMap synonyms, final int maxSynonyms) { - - if (child == null) - throw new IllegalArgumentException("child analyzer must not be null"); - if (synonyms == null) - throw new IllegalArgumentException("synonyms must not be null"); - if (maxSynonyms < 0) - throw new IllegalArgumentException("maxSynonyms must not be negative"); - if (maxSynonyms == 0) - return child; // no need to wrap - - return new Analyzer() { - @Override - public TokenStream tokenStream(String fieldName, Reader reader) { - return new SynonymTokenFilter( - child.tokenStream(fieldName, reader), synonyms, maxSynonyms); - } - }; - } - - - /** - * Returns an analyzer wrapper that caches all tokens 
generated by the underlying child analyzer's - * token streams, and delivers those cached tokens on subsequent calls to - * tokenStream(String fieldName, Reader reader) - * if the fieldName has been seen before, altogether ignoring the Reader parameter on cache lookup. - *

- * If Analyzer / TokenFilter chains are expensive in terms of I/O or CPU, such caching can - * help improve performance if the same document is added to multiple Lucene indexes, - * because the text analysis phase need not be performed more than once. - *

- * Caveats:
- * <ul>
- * <li>Caching the tokens of large Lucene documents can lead to out of memory exceptions.</li>
- * <li>The Token instances delivered by the underlying child analyzer must be immutable.</li>
- * <li>The same caching analyzer instance must not be used for more than one document
- * because the cache is not keyed on the Reader parameter.</li>
- * </ul>
- * - * @param child - * the underlying child analyzer - * @return a new analyzer - */ - public static Analyzer getTokenCachingAnalyzer(final Analyzer child) { - - if (child == null) - throw new IllegalArgumentException("child analyzer must not be null"); - - return new Analyzer() { - - private final HashMap> cache = new HashMap>(); - - @Override - public TokenStream tokenStream(String fieldName, Reader reader) { - final ArrayList tokens = cache.get(fieldName); - if (tokens == null) { // not yet cached - final ArrayList tokens2 = new ArrayList(); - TokenStream tokenStream = new TokenFilter(child.tokenStream(fieldName, reader)) { - - @Override - public boolean incrementToken() throws IOException { - boolean hasNext = input.incrementToken(); - if (hasNext) tokens2.add(captureState()); - return hasNext; - } - }; - - cache.put(fieldName, tokens2); - return tokenStream; - } else { // already cached - return new TokenStream() { - - private Iterator iter = tokens.iterator(); - - @Override - public boolean incrementToken() { - if (!iter.hasNext()) return false; - restoreState(iter.next()); - return true; - } - }; - } - } - }; - } - - - /** - * Returns (frequency:term) pairs for the top N distinct terms (aka words), - * sorted descending by frequency (and ascending by term, if tied). - *

- * Example XQuery:
- * <pre>
-   * declare namespace util = "java:org.apache.lucene.index.memory.AnalyzerUtil";
-   * declare namespace analyzer = "java:org.apache.lucene.index.memory.PatternAnalyzer";
-   * 
-   * for $pair in util:get-most-frequent-terms(
-   *    analyzer:EXTENDED_ANALYZER(), doc("samples/shakespeare/othello.xml"), 10)
-   * return &lt;word word="{substring-after($pair, ':')}" frequency="{substring-before($pair, ':')}"/&gt;
-   * </pre>
- * - * @param analyzer - * the analyzer to use for splitting text into terms (aka words) - * @param text - * the text to analyze - * @param limit - * the maximum number of pairs to return; zero indicates - * "as many as possible". - * @return an array of (frequency:term) pairs in the form of (freq0:term0, - * freq1:term1, ..., freqN:termN). Each pair is a single string - * separated by a ':' delimiter. - */ - public static String[] getMostFrequentTerms(Analyzer analyzer, String text, int limit) { - if (analyzer == null) - throw new IllegalArgumentException("analyzer must not be null"); - if (text == null) - throw new IllegalArgumentException("text must not be null"); - if (limit <= 0) limit = Integer.MAX_VALUE; - - // compute frequencies of distinct terms - HashMap map = new HashMap(); - TokenStream stream = analyzer.tokenStream("", new StringReader(text)); - TermAttribute termAtt = stream.addAttribute(TermAttribute.class); - try { - while (stream.incrementToken()) { - MutableInteger freq = map.get(termAtt.term()); - if (freq == null) { - freq = new MutableInteger(1); - map.put(termAtt.term(), freq); - } else { - freq.setValue(freq.intValue() + 1); - } - } - } catch (IOException e) { - throw new RuntimeException(e); - } finally { - try { - stream.close(); - } catch (IOException e2) { - throw new RuntimeException(e2); - } - } - - // sort by frequency, text - Map.Entry[] entries = new Map.Entry[map.size()]; - map.entrySet().toArray(entries); - Arrays.sort(entries, new Comparator>() { - public int compare(Map.Entry e1, Map.Entry e2) { - int f1 = e1.getValue().intValue(); - int f2 = e2.getValue().intValue(); - if (f2 - f1 != 0) return f2 - f1; - String s1 = e1.getKey(); - String s2 = e2.getKey(); - return s1.compareTo(s2); - } - }); - - // return top N entries - int size = Math.min(limit, entries.length); - String[] pairs = new String[size]; - for (int i=0; i < size; i++) { - pairs[i] = entries[i].getValue() + ":" + entries[i].getKey(); - } - return pairs; - } - - 
private static final class MutableInteger { - private int value; - public MutableInteger(int value) { this.value = value; } - public int intValue() { return value; } - public void setValue(int value) { this.value = value; } - @Override - public String toString() { return String.valueOf(value); } - } - - - - // TODO: could use a more general i18n approach ala http://icu.sourceforge.net/docs/papers/text_boundary_analysis_in_java/ - /** (Line terminator followed by zero or more whitespace) two or more times */ - private static final Pattern PARAGRAPHS = Pattern.compile("([\\r\\n\\u0085\\u2028\\u2029][ \\t\\x0B\\f]*){2,}"); - - /** - * Returns at most the first N paragraphs of the given text. Delimiting - * characters are excluded from the results. Each returned paragraph is - * whitespace-trimmed via String.trim(), potentially an empty string. - * - * @param text - * the text to tokenize into paragraphs - * @param limit - * the maximum number of paragraphs to return; zero indicates "as - * many as possible". - * @return the first N paragraphs - */ - public static String[] getParagraphs(String text, int limit) { - return tokenize(PARAGRAPHS, text, limit); - } - - private static String[] tokenize(Pattern pattern, String text, int limit) { - String[] tokens = pattern.split(text, limit); - for (int i=tokens.length; --i >= 0; ) tokens[i] = tokens[i].trim(); - return tokens; - } - - - // TODO: don't split on floating point numbers, e.g. 3.1415 (digit before or after '.') - /** Divides text into sentences; Includes inverted spanish exclamation and question mark */ -// private static final Pattern SENTENCES = Pattern.compile("[!\\.\\?\\xA1\\xBF]+"); - - /** - * Returns at most the first N sentences of the given text. Delimiting - * characters are excluded from the results. Each returned sentence is - * whitespace-trimmed via String.trim(), potentially an empty string. 
- * - * @param text - * the text to tokenize into sentences - * @param limit - * the maximum number of sentences to return; zero indicates "as - * many as possible". - * @return the first N sentences - */ - public static String[] getSentences(String text, int limit) { -// return tokenize(SENTENCES, text, limit); // equivalent but slower - int len = text.length(); - if (len == 0) return new String[] { text }; - if (limit <= 0) limit = Integer.MAX_VALUE; - - // average sentence length heuristic - String[] tokens = new String[Math.min(limit, 1 + len/40)]; - int size = 0; - int i = 0; - - while (i < len && size < limit) { - - // scan to end of current sentence - int start = i; - while (i < len && !isSentenceSeparator(text.charAt(i))) i++; - - if (size == tokens.length) { // grow array - String[] tmp = new String[tokens.length << 1]; - System.arraycopy(tokens, 0, tmp, 0, size); - tokens = tmp; - } - // add sentence (potentially empty) - tokens[size++] = text.substring(start, i).trim(); - - // scan to beginning of next sentence - while (i < len && isSentenceSeparator(text.charAt(i))) i++; - } - - if (size == tokens.length) return tokens; - String[] results = new String[size]; - System.arraycopy(tokens, 0, results, 0, size); - return results; - } - - private static boolean isSentenceSeparator(char c) { - // regex [!\\.\\?\\xA1\\xBF] - switch (c) { - case '!': return true; - case '.': return true; - case '?': return true; - case 0xA1: return true; // spanish inverted exclamation mark - case 0xBF: return true; // spanish inverted question mark - default: return false; - } - } - -} Index: lucene/contrib/CHANGES.txt =================================================================== --- lucene/contrib/CHANGES.txt (revision 940315) +++ lucene/contrib/CHANGES.txt (working copy) @@ -23,6 +23,11 @@ * LUCENE-2323: Moved contrib/wikipedia functionality into contrib/analyzers. 
Additionally the package was changed from org.apache.lucene.wikipedia.analysis to org.apache.lucene.analysis.wikipedia. (Robert Muir) + + * LUCENE-2413: Consolidated all analyzers into contrib/analyzers. + - contrib/analyzers/smartcn now depends on contrib/analyzers/common + - The "AnalyzerUtil" in wordnet was removed. + ... (in progress) Changes in runtime behavior Index: lucene/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java =================================================================== --- lucene/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java (revision 940315) +++ lucene/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java (working copy) @@ -25,7 +25,7 @@ import java.util.Set; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.PorterStemFilter; +import org.apache.lucene.analysis.en.PorterStemFilter; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; Index: lucene/contrib/analyzers/smartcn/build.xml =================================================================== --- lucene/contrib/analyzers/smartcn/build.xml (revision 940315) +++ lucene/contrib/analyzers/smartcn/build.xml (working copy) @@ -28,11 +28,27 @@ - + + + + + + + + + - + + + + + + Misc building dependency ${analyzers-common.jar} + + Index: lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java =================================================================== --- lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java (revision 940315) +++ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java (working copy) @@ -7,7 +7,7 @@ import 
org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.KeywordTokenizer; -import org.apache.lucene.analysis.PorterStemFilter; +import org.apache.lucene.analysis.en.PorterStemFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; Index: lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java =================================================================== --- lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java (revision 937139) +++ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java (working copy) @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis; +package org.apache.lucene.analysis.en; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -24,6 +24,14 @@ import java.io.StringReader; import java.util.zip.ZipFile; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.KeywordMarkerFilter; +import org.apache.lucene.analysis.KeywordTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.WhitespaceTokenizer; + /** * Test the PorterStemFilter with Martin Porter's test data. 
*/ Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java =================================================================== --- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java (revision 940315) +++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java (working copy) @@ -24,7 +24,6 @@ import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.KeywordMarkerFilter; import org.apache.lucene.analysis.LowerCaseFilter; -import org.apache.lucene.analysis.PorterStemFilter; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.StopwordAnalyzerBase; import org.apache.lucene.analysis.TokenStream; Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/PorterStemmer.java =================================================================== --- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/PorterStemmer.java (revision 937139) +++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/PorterStemmer.java (working copy) @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis; +package org.apache.lucene.analysis.en; /** * Licensed to the Apache Software Foundation (ASF) under one or more Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/PorterStemFilter.java =================================================================== --- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/PorterStemFilter.java (revision 937139) +++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/PorterStemFilter.java (working copy) @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis; +package org.apache.lucene.analysis.en; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -19,6 +19,8 @@ import java.io.IOException; +import org.apache.lucene.analysis.TokenFilter; +import 
org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;