Index: contrib/analysis/build.xml =================================================================== --- contrib/analysis/build.xml (revision 0) +++ contrib/analysis/build.xml (revision 0) @@ -0,0 +1,72 @@ + + + + + + + + Additional Analyzers + - analyzers: Additional Analyzers + - smartcn: Smart Analyzer for Simplified Chinese Text + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: contrib/analysis/analyzers/build.xml =================================================================== --- contrib/analysis/analyzers/build.xml (revision 791341) +++ contrib/analysis/analyzers/build.xml (working copy) @@ -25,6 +25,17 @@ + + + + - + + + + + + + + Index: contrib/analysis/analyzers/src/java/org/apache/lucene/analysis/cjk/package.html =================================================================== --- contrib/analysis/analyzers/src/java/org/apache/lucene/analysis/cjk/package.html (revision 791341) +++ contrib/analysis/analyzers/src/java/org/apache/lucene/analysis/cjk/package.html (working copy) @@ -1,5 +1,24 @@ - + + + + -Analyzer for Chinese, Japanese and Korean. +Analyzer for Chinese, Japanese, and Korean, which indexes bigrams (overlapping groups of two adjacent Han characters). +

+Three analyzers are provided for Chinese, each of which treats Chinese text in a different way. +

    +
  • ChineseAnalyzer (in the analyzers/cn package): Index unigrams (individual Chinese characters) as a token. +
  • CJKAnalyzer (in this package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens. +
  • SmartChineseAnalyzer (in the analyzer-smartcn package): Index words (attempt to segment Chinese text into words) as tokens. +
+ +Example phrase: "我是中国人" +
    +
  1. ChineseAnalyzer: 我-是-中-国-人
  2. CJKAnalyzer: 我是-是中-中国-国人
  3. SmartChineseAnalyzer: 我-是-中国-人
+

+ Index: contrib/analysis/analyzers/src/java/org/apache/lucene/analysis/cn/package.html =================================================================== --- contrib/analysis/analyzers/src/java/org/apache/lucene/analysis/cn/package.html (revision 791341) +++ contrib/analysis/analyzers/src/java/org/apache/lucene/analysis/cn/package.html (working copy) @@ -1,13 +1,15 @@ - + + + -Analyzers for Chinese. +Analyzer for Chinese, which indexes unigrams (individual Chinese characters).

Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.

    -
  • ChineseAnalyzer: Index unigrams (individual Chinese characters) as a token. -
  • CJKAnalyzer: Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens. -
  • SmartChineseAnalyzer: Index words (attempt to segment Chinese text into words) as tokens. +
  • ChineseAnalyzer (in this package): Index unigrams (individual Chinese characters) as a token. +
  • CJKAnalyzer (in the analyzers/cjk package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens. +
  • SmartChineseAnalyzer (in the analyzer-smartcn package): Index words (attempt to segment Chinese text into words) as tokens.
Example phrase: "我是中国人" Index: contrib/analysis/smartcn/build.xml =================================================================== --- contrib/analysis/smartcn/build.xml (revision 0) +++ contrib/analysis/smartcn/build.xml (revision 0) @@ -0,0 +1,41 @@ + + + + + + + + Smart Chinese Analyzer + + + + + + + + + + + + + + + + + + Index: contrib/analysis/smartcn/pom.xml.template =================================================================== --- contrib/analysis/smartcn/pom.xml.template (revision 0) +++ contrib/analysis/smartcn/pom.xml.template (revision 0) @@ -0,0 +1,35 @@ + + + + 4.0.0 + + org.apache.lucene + lucene-contrib + @version@ + + org.apache.lucene + lucene-analysis-smartcn + Lucene Smart Chinese Analyzer + @version@ + Smart Chinese Analyzer + jar + Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/AbstractDictionary.java =================================================================== --- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/AbstractDictionary.java (revision 0) +++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/AbstractDictionary.java (working copy) @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.lucene.analysis.cn.smart.hhmm; +package org.apache.lucene.analysis.cn; import java.io.UnsupportedEncodingException; @@ -27,7 +27,7 @@ * Contains methods for dealing with GB2312 encoding. *

*/ -public abstract class AbstractDictionary { +abstract class AbstractDictionary { /** * First Chinese Character in GB2312 (15 * 94) * Characters in GB2312 are arranged in a grid of 94 * 94, 0-14 are unassigned or punctuation. Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/AnalyzerProfile.java =================================================================== --- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/AnalyzerProfile.java (revision 0) +++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/AnalyzerProfile.java (working copy) @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.lucene.analysis.cn.smart; +package org.apache.lucene.analysis.cn; import java.io.File; import java.io.FileInputStream; @@ -49,12 +49,12 @@ * * */ -public class AnalyzerProfile { +class AnalyzerProfile { /** * Global indicating the configured analysis data directory */ - public static String ANALYSIS_DATA_DIR = ""; + static String ANALYSIS_DATA_DIR = ""; static { init(); Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/BiSegGraph.java =================================================================== --- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/BiSegGraph.java (revision 0) +++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/BiSegGraph.java (working copy) @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.lucene.analysis.cn.smart.hhmm; +package org.apache.lucene.analysis.cn; import java.util.ArrayList; import java.util.Collection; @@ -24,7 +24,6 @@ import java.util.List; import java.util.Map; -import org.apache.lucene.analysis.cn.smart.Utility; /** * Graph representing possible token pairs (bigrams) at each start offset in the sentence. @@ -32,7 +31,7 @@ * For each start offset, a list of possible token pairs is stored. *

*/ -public class BiSegGraph { +class BiSegGraph { private Map tokenPairListTable = new HashMap(); Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/BigramDictionary.java =================================================================== --- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/BigramDictionary.java (revision 0) +++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/BigramDictionary.java (working copy) @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.lucene.analysis.cn.smart.hhmm; +package org.apache.lucene.analysis.cn; import java.io.File; import java.io.FileInputStream; @@ -30,12 +30,11 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; -import org.apache.lucene.analysis.cn.smart.AnalyzerProfile; /** * SmartChineseAnalyzer Bigram dictionary. */ -public class BigramDictionary extends AbstractDictionary { +class BigramDictionary extends AbstractDictionary { private BigramDictionary() { } @@ -208,45 +207,6 @@ // log.info("load dictionary done! 
" + dctFilePath + " total:" + total); } - /* - * public void test(String dctFilePath) throws IOException { int i, cnt, - * length, total = 0; int corrupt = 0, notFound = 0; // - * 文件中只统计了6763个汉字加5个空汉字符3756~3760,其中第3756个用来存储符号信息。 int[] buffer = new int[3]; - * byte[] intBuffer = new byte[4]; String tmpword; RandomAccessFile dctFile = - * new RandomAccessFile(dctFilePath, "r"); - * - * // 字典文件中第一个汉字出现的位置是0,最后一个是6768 for (i = GB2312_FIRST_CHAR; i < - * GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { String currentStr = - * getCCByGB2312Id(i); // if (i == 5231) // System.out.println(i); - * - * dctFile.read(intBuffer);// 原词库文件在c下开发,所以写入的文件为little // endian编码,而java为big - * endian,必须转换过来 cnt = - * ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN) .getInt(); if - * (cnt <= 0) { continue; } total += cnt; int j = 0; while (j < cnt) { - * dctFile.read(intBuffer); buffer[0] = ByteBuffer.wrap(intBuffer).order( - * ByteOrder.LITTLE_ENDIAN).getInt();// frequency dctFile.read(intBuffer); - * buffer[1] = ByteBuffer.wrap(intBuffer).order( - * ByteOrder.LITTLE_ENDIAN).getInt();// length dctFile.read(intBuffer); // - * buffer[2] = ByteBuffer.wrap(intBuffer).order( // - * ByteOrder.LITTLE_ENDIAN).getInt();// handle - * - * length = buffer[1]; if (length > 0) { byte[] lchBuffer = new byte[length]; - * dctFile.read(lchBuffer); tmpword = new String(lchBuffer, "GB2312"); if (i - * != 3755 + GB2312_FIRST_CHAR) { tmpword = currentStr + tmpword; } char - * carray[] = tmpword.toCharArray(); int index = getBigramItemIndex(carray); - * if (index != -1) { // if (!bigramStringTable[index].equals(tmpword)) { // - * System.out.println("corrupt: " + tmpword + "<->" // + - * bigramStringTable[index]); // corrupt++; // } } else { - * System.out.println("not found: " + tmpword); notFound++; } } j++; } } - * dctFile.close(); System.out.println("num not found:" + notFound); - * System.out.println("num corrupt:" + corrupt); - * - * log.info("test dictionary done! 
" + dctFilePath + " total:" + total); cnt = - * 0; for (int j = 0; j < PRIME_BIGRAM_LENGTH; j++) { if (bigramHashTable[j] - * != 0) { cnt++; } } System.out.println("total num in bigramTable: " + cnt); - * } - */ - private int getAvaliableIndex(long hashId, char carray[]) { int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH); int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH; @@ -307,13 +267,4 @@ return 0; } - public static void main(String[] args) throws FileNotFoundException, - UnsupportedEncodingException, IOException { - BigramDictionary dic = new BigramDictionary(); - dic.load("D:/analysis-data"); - // dic.test("D:/analysis-data/BigramDict.dct"); - System.out.println("max:" + dic.max); - System.out.println("average repeat:" + (double) dic.repeat / 328856); - System.out.println("end"); - } } Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/CharType.java =================================================================== --- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/CharType.java (revision 0) +++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/CharType.java (working copy) @@ -15,12 +15,12 @@ * limitations under the License. */ -package org.apache.lucene.analysis.cn.smart; +package org.apache.lucene.analysis.cn; /** * Internal SmartChineseAnalyzer character type constants. */ -public class CharType { +final class CharType { /** * Punctuation Characters Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/HHMMSegmenter.java =================================================================== --- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/HHMMSegmenter.java (revision 0) +++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/HHMMSegmenter.java (working copy) @@ -15,18 +15,14 @@ * limitations under the License. 
*/ -package org.apache.lucene.analysis.cn.smart.hhmm; +package org.apache.lucene.analysis.cn; import java.util.List; -import org.apache.lucene.analysis.cn.smart.CharType; -import org.apache.lucene.analysis.cn.smart.Utility; -import org.apache.lucene.analysis.cn.smart.WordType; - /** * Finds the optimal segmentation of a sentence into Chinese words */ -public class HHMMSegmenter { +class HHMMSegmenter { private static WordDictionary wordDict = WordDictionary.getInstance(); Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/PathNode.java =================================================================== --- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/PathNode.java (revision 0) +++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/PathNode.java (working copy) @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.lucene.analysis.cn.smart.hhmm; +package org.apache.lucene.analysis.cn; /** * SmartChineseAnalyzer internal node representation @@ -23,7 +23,7 @@ * Used by {@link BiSegGraph} to maximize the segmentation with the Viterbi algorithm. *

*/ -public class PathNode implements Comparable { +class PathNode implements Comparable { public double weight; public int preNode; Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegGraph.java =================================================================== --- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegGraph.java (revision 0) +++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegGraph.java (working copy) @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.lucene.analysis.cn.smart.hhmm; +package org.apache.lucene.analysis.cn; import java.util.ArrayList; import java.util.HashMap; @@ -29,7 +29,7 @@ * For each start offset, a list of possible tokens is stored. *

*/ -public class SegGraph { +class SegGraph { /** * Map of start offsets to ArrayList of tokens at that position Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegToken.java =================================================================== --- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegToken.java (revision 0) +++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegToken.java (working copy) @@ -15,32 +15,46 @@ * limitations under the License. */ -package org.apache.lucene.analysis.cn.smart.hhmm; +package org.apache.lucene.analysis.cn; /** - * SmartChineseAnalyzer internal token + * SmartChineseAnalyzer internal token. */ -public class SegToken { - public char[] charArray; +class SegToken { + char[] charArray; - public int startOffset; + int startOffset; - public int endOffset; + int endOffset; - public int wordType; + int wordType; - public int weight; + int weight; - public int index; + int index; + /** + * Create a new SegToken from a {@link String} + * + * @param word + * @param start start offset of text + * @param end end offset of text + * @param wordType {@link WordType} of the text + * @param weight word frequency + */ public SegToken(String word, int start, int end, int wordType, int weight) { - this.charArray = word.toCharArray(); - this.startOffset = start; - this.endOffset = end; - this.wordType = wordType; - this.weight = weight; + this(word.toCharArray(), start, end, wordType, weight); } + /** + * Create a new SegToken from a character array. 
+ * + * @param idArray character array containing text + * @param start start offset of text + * @param end end offset of text + * @param wordType {@link WordType} of the text + * @param weight word frequency + */ public SegToken(char[] idArray, int start, int end, int wordType, int weight) { this.charArray = idArray; this.startOffset = start; @@ -48,14 +62,4 @@ this.wordType = wordType; this.weight = weight; } - - // public String toString() { - // return String.valueOf(charArray) + "/s(" + startOffset + ")e(" - // + endOffset + ")/w(" + weight + ")t(" + wordType + ")"; - // } - - // public boolean equals(RawToken t) { - // return this.startOffset == t.startOffset - // && this.endOffset == t.endOffset; - // } } Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegTokenFilter.java =================================================================== --- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegTokenFilter.java (revision 0) +++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegTokenFilter.java (working copy) @@ -15,10 +15,8 @@ * limitations under the License. */ -package org.apache.lucene.analysis.cn.smart.hhmm; +package org.apache.lucene.analysis.cn; -import org.apache.lucene.analysis.cn.smart.Utility; -import org.apache.lucene.analysis.cn.smart.WordType; /** *

@@ -26,7 +24,7 @@ * Additionally, all punctuation is converted into {@link Utility#COMMON_DELIMITER} *

*/ -public class SegTokenFilter { +class SegTokenFilter { /** * Filter an input {@link SegToken} Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegTokenPair.java =================================================================== --- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegTokenPair.java (revision 0) +++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegTokenPair.java (working copy) @@ -15,12 +15,12 @@ * limitations under the License. */ -package org.apache.lucene.analysis.cn.smart.hhmm; +package org.apache.lucene.analysis.cn; /** * A pair of tokens in {@link SegGraph} */ -public class SegTokenPair { +class SegTokenPair { public char[] charArray; @@ -43,13 +43,4 @@ this.weight = weight; } - // public String toString() { - // return String.valueOf(charArray) + ":f(" + from + ")t(" + to + "):" - // + weight; - // } - - // public boolean equals(SegTokenPair tp) { - // return this.from == tp.from && this.to == tp.to; - // } - } Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SentenceTokenizer.java =================================================================== --- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SentenceTokenizer.java (revision 0) +++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SentenceTokenizer.java (working copy) @@ -15,9 +15,8 @@ * limitations under the License. */ -package org.apache.lucene.analysis.cn.smart; +package org.apache.lucene.analysis.cn; -import java.io.BufferedReader; import java.io.IOException; import java.io.Reader; @@ -25,35 +24,33 @@ import org.apache.lucene.analysis.Tokenizer; /** - * Tokenizes input into sentences. + * Tokenizes input text into sentences. + *

+ * The output tokens can then be broken into words with {@link WordTokenFilter} + *

*/ public class SentenceTokenizer extends Tokenizer { /** * End of sentence punctuation: 。,!?;,!?; */ - public final static String PUNCTION = "。,!?;,!?;"; + private final static String PUNCTION = "。,!?;,!?;"; private StringBuffer buffer = new StringBuffer(); - private BufferedReader bufferInput; - private int tokenStart = 0, tokenEnd = 0; - private Token t = new Token(); - public SentenceTokenizer(Reader reader) { super(reader); - bufferInput = new BufferedReader(reader, 2048); } - public Token next() throws IOException { + public Token next(final Token reusableToken) throws IOException { buffer.setLength(0); int ci; char ch, pch; boolean atBegin = true; tokenStart = tokenEnd; - ci = bufferInput.read(); + ci = input.read(); ch = (char) ci; while (true) { @@ -67,14 +64,14 @@ } else if (atBegin && Utility.SPACES.indexOf(ch) != -1) { tokenStart++; tokenEnd++; - ci = bufferInput.read(); + ci = input.read(); ch = (char) ci; } else { buffer.append(ch); atBegin = false; tokenEnd++; pch = ch; - ci = bufferInput.read(); + ci = input.read(); ch = (char) ci; // Two spaces, such as CR, LF if (Utility.SPACES.indexOf(ch) != -1 @@ -88,14 +85,10 @@ if (buffer.length() == 0) return null; else { - t.clear(); - t.reinit(buffer.toString(), input.correctOffset(tokenStart), input.correctOffset(tokenEnd), "sentence"); - return t; + reusableToken.clear(); + reusableToken.reinit(buffer.toString(), input.correctOffset(tokenStart), input.correctOffset(tokenEnd), "sentence"); + return reusableToken; } } - public void close() throws IOException { - bufferInput.close(); - } - } Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java =================================================================== --- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java (revision 0) +++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java (working copy) @@ -17,24 +17,19 @@ package 
org.apache.lucene.analysis.cn; -import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; -import java.util.HashSet; import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.PorterStemFilter; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.cn.smart.SentenceTokenizer; -import org.apache.lucene.analysis.cn.smart.WordSegmenter; -import org.apache.lucene.analysis.cn.smart.WordTokenizer; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.WordlistLoader; -import org.apache.lucene.analysis.cn.smart.AnalyzerProfile; // for javadoc - /** *

* SmartChineseAnalyzer is an analyzer for Chinese or mixed Chinese-English text. @@ -47,19 +42,35 @@ *

*

* This analyzer requires a dictionary to provide statistical data. - * To specify the location of the dictionary data, refer to {@link AnalyzerProfile} + * SmartChineseAnalyzer has an included dictionary out-of-box. *

*

* The included dictionary data is from ICTCLAS1.0. * Thanks to ICTCLAS for their hard work, and for contributing the data under the Apache 2 License! *

+ *

+ * In special circumstances a user may wish to configure SmartChineseAnalyzer with a custom data directory location, containing bigramdict.dct and coredict.dct + *

+ * The following order is used to determine the location of the data directory: + * + *
    + *
  1. System property: -Danalysis.data.dir=/path/to/analysis-data
  2. Relative path: analysis-data
  3. Relative path: lib/analysis-data
  4. Property file: analysis.data.dir property from relative path analysis.properties
  5. Property file: analysis.data.dir property from relative path lib/analysis.properties
+ * + * Example property file: + * + *
+ * analysis.data.dir=D:/path/to/analysis-data/
+ * 
*/ public class SmartChineseAnalyzer extends Analyzer { private Set stopWords = null; - private WordSegmenter wordSegment; - /** * Create a new SmartChineseAnalyzer, using the default stopword list. */ @@ -80,10 +91,15 @@ */ public SmartChineseAnalyzer(boolean useDefaultStopWords) { if (useDefaultStopWords) { - stopWords = loadStopWords(this.getClass().getResourceAsStream( - "stopwords.txt")); + try { + InputStream stream = this.getClass().getResourceAsStream("stopwords.txt"); + InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); + stopWords = WordlistLoader.getWordSet(reader, "//"); + } catch (IOException e) { + // TODO: throw IOException + throw new RuntimeException(e); + } } - wordSegment = new WordSegmenter(); } /** @@ -94,16 +110,14 @@ * Note: the set should include punctuation, unless you want to index punctuation! *

* @param stopWords {@link Set} of stopwords to use. - * @see SmartChineseAnalyzer#loadStopWords(InputStream) */ public SmartChineseAnalyzer(Set stopWords) { this.stopWords = stopWords; - wordSegment = new WordSegmenter(); } public TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new SentenceTokenizer(reader); - result = new WordTokenizer(result, wordSegment); + result = new WordTokenFilter(result); // result = new LowerCaseFilter(result); // LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text. // The porter stemming is too strict, this is not a bug, this is a feature:) @@ -113,37 +127,28 @@ } return result; } - - /** - * Utility function to return a {@link Set} of stopwords from a UTF-8 encoded {@link InputStream}. - * The comment "//" can be used in the stopword list. - * - * @param input {@link InputStream} of UTF-8 encoded stopwords - * @return {@link Set} of stopwords. - */ - public static Set loadStopWords(InputStream input) { - /* - * Note: WordListLoader is not used here because this method allows for inline "//" comments. - * WordListLoader will only filter out these comments if they are on a separate line. 
- */ - String line; - Set stopWords = new HashSet(); - try { - BufferedReader br = new BufferedReader(new InputStreamReader(input, - "UTF-8")); - while ((line = br.readLine()) != null) { - if (line.indexOf("//") != -1) { - line = line.substring(0, line.indexOf("//")); - } - line = line.trim(); - if (line.length() != 0) - stopWords.add(line.toLowerCase()); + + private static final class SavedStreams { + Tokenizer tokenStream; + TokenStream filteredTokenStream; + } + + public TokenStream reusableTokenStream(String fieldName, Reader reader) + throws IOException { + SavedStreams streams = (SavedStreams) getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + setPreviousTokenStream(streams); + streams.tokenStream = new SentenceTokenizer(reader); + streams.filteredTokenStream = new WordTokenFilter(streams.tokenStream); + streams.filteredTokenStream = new PorterStemFilter(streams.filteredTokenStream); + if (stopWords != null) { + streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopWords, false); } - br.close(); - } catch (IOException e) { - System.err.println("WARNING: cannot open stop words list!"); + } else { + streams.tokenStream.reset(reader); } - return stopWords; + + return streams.filteredTokenStream; } - } Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/Utility.java =================================================================== --- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/Utility.java (revision 0) +++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/Utility.java (working copy) @@ -15,15 +15,13 @@ * limitations under the License. 
*/ -package org.apache.lucene.analysis.cn.smart; +package org.apache.lucene.analysis.cn; -import org.apache.lucene.analysis.cn.smart.hhmm.BiSegGraph; // for javadoc -import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter; // for javadoc /** * SmartChineseAnalyzer utility constants and methods */ -public class Utility { +class Utility { public static final char[] STRING_CHAR_ARRAY = new String("未##串") .toCharArray(); Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordDictionary.java =================================================================== --- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordDictionary.java (revision 0) +++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordDictionary.java (working copy) @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.lucene.analysis.cn.smart.hhmm; +package org.apache.lucene.analysis.cn; import java.io.File; import java.io.FileInputStream; @@ -30,14 +30,11 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; -import org.apache.lucene.analysis.cn.smart.AnalyzerProfile; -import org.apache.lucene.analysis.cn.smart.Utility; - /** * SmartChineseAnalyzer Word Dictionary * */ -public class WordDictionary extends AbstractDictionary { +class WordDictionary extends AbstractDictionary { private WordDictionary() { } @@ -550,18 +547,4 @@ wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0; } - public static void main(String[] args) throws FileNotFoundException, - IOException { - WordDictionary dic = new WordDictionary(); - dic.load("D:/analysis-data"); - Utility.getCharType('。'); - Utility.getCharType('汗'); - Utility.getCharType(' ');// 0020 - Utility.getCharType(' ');// 3000 - Utility.getCharType('');// E095 - Utility.getCharType(' ');// 3000 - Utility.getCharType('\r');// 000D - Utility.getCharType('\n');// 000A - Utility.getCharType('\t');// 0009 - } } Index: 
contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordSegmenter.java =================================================================== --- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordSegmenter.java (revision 0) +++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordSegmenter.java (working copy) @@ -15,27 +15,24 @@ * limitations under the License. */ -package org.apache.lucene.analysis.cn.smart; +package org.apache.lucene.analysis.cn; import java.util.ArrayList; import java.util.List; import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter; -import org.apache.lucene.analysis.cn.smart.hhmm.SegToken; -import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter; /** * Segment a sentence of Chinese text into words. */ -public class WordSegmenter { +class WordSegmenter { private HHMMSegmenter hhmmSegmenter = new HHMMSegmenter(); private SegTokenFilter tokenFilter = new SegTokenFilter(); /** - * Segment a sentence into words with {@link HHMMSegmenter} + * Segment a sentence of Chinese text into words * * @param sentenceToken sentence {@link Token} * @return {@link List} of {@link SegToken} Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordTokenFilter.java =================================================================== --- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordTokenFilter.java (revision 0) +++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordTokenFilter.java (working copy) @@ -15,47 +15,43 @@ * limitations under the License. 
*/ -package org.apache.lucene.analysis.cn.smart; +package org.apache.lucene.analysis.cn; import java.io.IOException; import java.util.Iterator; import java.util.List; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; /** - * A {@link Tokenizer} that breaks sentences into words. + * A {@link TokenFilter} that breaks sentences into words. */ -public class WordTokenizer extends Tokenizer { +public class WordTokenFilter extends TokenFilter { private WordSegmenter wordSegmenter; - private TokenStream in; - private Iterator tokenIter; private List tokenBuffer; - private Token sentenceToken = new Token(); - /** * Construct a new WordTokenizer. * - * @param in {@link TokenStream} of sentences - * @param wordSegmenter {@link WordSegmenter} to break sentences into words + * @param in {@link TokenStream} of sentences */ - public WordTokenizer(TokenStream in, WordSegmenter wordSegmenter) { - this.in = in; - this.wordSegmenter = wordSegmenter; + public WordTokenFilter(TokenStream in) { + super(in); + this.wordSegmenter = new WordSegmenter(); } - public Token next() throws IOException { + public Token next(final Token reusableSentenceToken) throws IOException { if (tokenIter != null && tokenIter.hasNext()) return (Token) tokenIter.next(); else { - if (processNextSentence()) { + Token nextToken = input.next(reusableSentenceToken); + if (processNextSentence(nextToken)) { return (Token) tokenIter.next(); } else return null; @@ -65,20 +61,15 @@ /** * Process the next input sentence, placing tokens into tokenBuffer * + * @param reusableSentenceToken input sentence * @return true if more tokens were placed into tokenBuffer. 
* @throws IOException */ - private boolean processNextSentence() throws IOException { - sentenceToken = in.next(sentenceToken); - if (sentenceToken == null) + private boolean processNextSentence(final Token reusableSentenceToken) throws IOException { + if (reusableSentenceToken == null) return false; - tokenBuffer = wordSegmenter.segmentSentence(sentenceToken); + tokenBuffer = wordSegmenter.segmentSentence(reusableSentenceToken); tokenIter = tokenBuffer.iterator(); return tokenBuffer != null && tokenIter.hasNext(); } - - public void close() throws IOException { - in.close(); - } - } Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordType.java =================================================================== --- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordType.java (revision 0) +++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordType.java (working copy) @@ -15,12 +15,12 @@ * limitations under the License. */ -package org.apache.lucene.analysis.cn.smart; +package org.apache.lucene.analysis.cn; /** * Internal SmartChineseAnalyzer token type constants */ -public class WordType { +final class WordType { /** * Start of a Sentence Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/package.html =================================================================== --- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/package.html (revision 0) +++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/package.html (revision 0) @@ -0,0 +1,24 @@ + + + + + +Analyzer for Simplified Chinese, which indexes words. +

+Three analyzers are provided for Chinese, each of which treats Chinese text in a different way. +

    +
  • ChineseAnalyzer (in the analyzers/cn package): Index unigrams (individual Chinese characters) as a token. +
  • CJKAnalyzer (in the analyzers/cjk package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens. +
  • SmartChineseAnalyzer (in this package): Index words (attempt to segment Chinese text into words) as tokens. +
+ +Example phrase: "我是中国人" +
    +
  1. ChineseAnalyzer: 我-是-中-国-人
  2. CJKAnalyzer: 我是-是中-中国-国人
  3. SmartChineseAnalyzer: 我-是-中国-人
+

+ + + \ No newline at end of file Index: contrib/analysis/smartcn/src/resources/org/apache/lucene/analysis/cn/stopwords.txt =================================================================== --- contrib/analysis/smartcn/src/resources/org/apache/lucene/analysis/cn/stopwords.txt (revision 0) +++ contrib/analysis/smartcn/src/resources/org/apache/lucene/analysis/cn/stopwords.txt (working copy) @@ -51,7 +51,8 @@ [ ] ● - //IDEOGRAPHIC SPACE character (Used as a space in Chinese) +// the line below contains an IDEOGRAPHIC SPACE character (Used as a space in Chinese) +  //////////////// English Stop Words //////////////// Index: contrib/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java =================================================================== --- contrib/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java (revision 0) +++ contrib/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java (working copy) @@ -40,6 +40,28 @@ } /* + * This test is the same as the above, except with two phrases. + * This tests to ensure the SentenceTokenizer->WordTokenFilter chain works correctly. + */ + public void testChineseStopWordsDefaultTwoPhrases() throws Exception { + Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */ + String sentence = "我购买了道具和服装。 我购买了道具和服装。"; + String result[] = { "我", "购买", "了", "道具", "和", "服装", "我", "购买", "了", "道具", "和", "服装" }; + assertAnalyzesTo(ca, sentence, result); + } + + /* + * This test is the same as the above, except using an ideographic space as a separator. + * This tests to ensure the stopwords are working correctly. 
+ */ + public void testChineseStopWordsDefaultTwoPhrasesIdeoSpache() throws Exception { + Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */ + String sentence = "我购买了道具和服装 我购买了道具和服装。"; + String result[] = { "我", "购买", "了", "道具", "和", "服装", "我", "购买", "了", "道具", "和", "服装" }; + assertAnalyzesTo(ca, sentence, result); + } + + /* * Punctuation is handled in a strange way if you disable stopwords * In this example the IDEOGRAPHIC FULL STOP is converted into a comma. * if you don't supply (true) to the constructor, or use a different stopwords list,