Index: contrib/analysis/build.xml
===================================================================
--- contrib/analysis/build.xml (revision 0)
+++ contrib/analysis/build.xml (revision 0)
@@ -0,0 +1,72 @@
+
+
+
+
+
+
+
+ Additional Analyzers
+ - analyzers: Additional Analyzers
+ - smartcn: Smart Analyzer for Simplified Chinese Text
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: contrib/analysis/analyzers/build.xml
===================================================================
--- contrib/analysis/analyzers/build.xml (revision 791341)
+++ contrib/analysis/analyzers/build.xml (working copy)
@@ -25,6 +25,17 @@
+
+
+
+
-
+
+
+
+
+
+
+
+
Index: contrib/analysis/analyzers/src/java/org/apache/lucene/analysis/cjk/package.html
===================================================================
--- contrib/analysis/analyzers/src/java/org/apache/lucene/analysis/cjk/package.html (revision 791341)
+++ contrib/analysis/analyzers/src/java/org/apache/lucene/analysis/cjk/package.html (working copy)
@@ -1,5 +1,24 @@
-
+
+
+
+
-Analyzer for Chinese, Japanese and Korean.
+Analyzer for Chinese, Japanese, and Korean, which indexes bigrams (overlapping groups of two adjacent Han characters).
+
+Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
+
+ - ChineseAnalyzer (in the analyzers/cn package): Index unigrams (individual Chinese characters) as a token.
+
- CJKAnalyzer (in this package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
+
- SmartChineseAnalyzer (in the analyzer-smartcn package): Index words (attempt to segment Chinese text into words) as tokens.
+
+
+Example phrase: "我是中国人"
+
+ - ChineseAnalyzer: 我-是-中-国-人
+ - CJKAnalyzer: 我是-是中-中国-国人
+ - SmartChineseAnalyzer: 我-是-中国-人
+
+
+
Index: contrib/analysis/analyzers/src/java/org/apache/lucene/analysis/cn/package.html
===================================================================
--- contrib/analysis/analyzers/src/java/org/apache/lucene/analysis/cn/package.html (revision 791341)
+++ contrib/analysis/analyzers/src/java/org/apache/lucene/analysis/cn/package.html (working copy)
@@ -1,13 +1,15 @@
-
+
+
+
-Analyzers for Chinese.
+Analyzer for Chinese, which indexes unigrams (individual Chinese characters).
Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
- - ChineseAnalyzer: Index unigrams (individual Chinese characters) as a token.
-
- CJKAnalyzer: Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
-
- SmartChineseAnalyzer: Index words (attempt to segment Chinese text into words) as tokens.
+
- ChineseAnalyzer (in this package): Index unigrams (individual Chinese characters) as a token.
+
- CJKAnalyzer (in the analyzers/cjk package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
+
- SmartChineseAnalyzer (in the analyzer-smartcn package): Index words (attempt to segment Chinese text into words) as tokens.
Example phrase: "我是中国人"
Index: contrib/analysis/smartcn/build.xml
===================================================================
--- contrib/analysis/smartcn/build.xml (revision 0)
+++ contrib/analysis/smartcn/build.xml (revision 0)
@@ -0,0 +1,41 @@
+
+
+
+
+
+
+
+ Smart Chinese Analyzer
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: contrib/analysis/smartcn/pom.xml.template
===================================================================
--- contrib/analysis/smartcn/pom.xml.template (revision 0)
+++ contrib/analysis/smartcn/pom.xml.template (revision 0)
@@ -0,0 +1,35 @@
+
+
+
+ 4.0.0
+
+ org.apache.lucene
+ lucene-contrib
+ @version@
+
+ org.apache.lucene
+ lucene-analysis-smartcn
+ Lucene Smart Chinese Analyzer
+ @version@
+ Smart Chinese Analyzer
+ jar
+
Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/AbstractDictionary.java
===================================================================
--- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/AbstractDictionary.java (revision 0)
+++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/AbstractDictionary.java (working copy)
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.lucene.analysis.cn.smart.hhmm;
+package org.apache.lucene.analysis.cn;
import java.io.UnsupportedEncodingException;
@@ -27,7 +27,7 @@
* Contains methods for dealing with GB2312 encoding.
*
*/
-public abstract class AbstractDictionary {
+abstract class AbstractDictionary {
/**
* First Chinese Character in GB2312 (15 * 94)
* Characters in GB2312 are arranged in a grid of 94 * 94, 0-14 are unassigned or punctuation.
Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/AnalyzerProfile.java
===================================================================
--- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/AnalyzerProfile.java (revision 0)
+++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/AnalyzerProfile.java (working copy)
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.lucene.analysis.cn.smart;
+package org.apache.lucene.analysis.cn;
import java.io.File;
import java.io.FileInputStream;
@@ -49,12 +49,12 @@
*
*
*/
-public class AnalyzerProfile {
+class AnalyzerProfile {
/**
* Global indicating the configured analysis data directory
*/
- public static String ANALYSIS_DATA_DIR = "";
+ static String ANALYSIS_DATA_DIR = "";
static {
init();
Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/BiSegGraph.java
===================================================================
--- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/BiSegGraph.java (revision 0)
+++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/BiSegGraph.java (working copy)
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.lucene.analysis.cn.smart.hhmm;
+package org.apache.lucene.analysis.cn;
import java.util.ArrayList;
import java.util.Collection;
@@ -24,7 +24,6 @@
import java.util.List;
import java.util.Map;
-import org.apache.lucene.analysis.cn.smart.Utility;
/**
* Graph representing possible token pairs (bigrams) at each start offset in the sentence.
@@ -32,7 +31,7 @@
* For each start offset, a list of possible token pairs is stored.
*
*/
-public class BiSegGraph {
+class BiSegGraph {
private Map tokenPairListTable = new HashMap();
Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/BigramDictionary.java
===================================================================
--- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/BigramDictionary.java (revision 0)
+++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/BigramDictionary.java (working copy)
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.lucene.analysis.cn.smart.hhmm;
+package org.apache.lucene.analysis.cn;
import java.io.File;
import java.io.FileInputStream;
@@ -30,12 +30,11 @@
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
-import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
/**
* SmartChineseAnalyzer Bigram dictionary.
*/
-public class BigramDictionary extends AbstractDictionary {
+class BigramDictionary extends AbstractDictionary {
private BigramDictionary() {
}
@@ -208,45 +207,6 @@
// log.info("load dictionary done! " + dctFilePath + " total:" + total);
}
- /*
- * public void test(String dctFilePath) throws IOException { int i, cnt,
- * length, total = 0; int corrupt = 0, notFound = 0; //
- * 文件中只统计了6763个汉字加5个空汉字符3756~3760,其中第3756个用来存储符号信息。 int[] buffer = new int[3];
- * byte[] intBuffer = new byte[4]; String tmpword; RandomAccessFile dctFile =
- * new RandomAccessFile(dctFilePath, "r");
- *
- * // 字典文件中第一个汉字出现的位置是0,最后一个是6768 for (i = GB2312_FIRST_CHAR; i <
- * GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { String currentStr =
- * getCCByGB2312Id(i); // if (i == 5231) // System.out.println(i);
- *
- * dctFile.read(intBuffer);// 原词库文件在c下开发,所以写入的文件为little // endian编码,而java为big
- * endian,必须转换过来 cnt =
- * ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN) .getInt(); if
- * (cnt <= 0) { continue; } total += cnt; int j = 0; while (j < cnt) {
- * dctFile.read(intBuffer); buffer[0] = ByteBuffer.wrap(intBuffer).order(
- * ByteOrder.LITTLE_ENDIAN).getInt();// frequency dctFile.read(intBuffer);
- * buffer[1] = ByteBuffer.wrap(intBuffer).order(
- * ByteOrder.LITTLE_ENDIAN).getInt();// length dctFile.read(intBuffer); //
- * buffer[2] = ByteBuffer.wrap(intBuffer).order( //
- * ByteOrder.LITTLE_ENDIAN).getInt();// handle
- *
- * length = buffer[1]; if (length > 0) { byte[] lchBuffer = new byte[length];
- * dctFile.read(lchBuffer); tmpword = new String(lchBuffer, "GB2312"); if (i
- * != 3755 + GB2312_FIRST_CHAR) { tmpword = currentStr + tmpword; } char
- * carray[] = tmpword.toCharArray(); int index = getBigramItemIndex(carray);
- * if (index != -1) { // if (!bigramStringTable[index].equals(tmpword)) { //
- * System.out.println("corrupt: " + tmpword + "<->" // +
- * bigramStringTable[index]); // corrupt++; // } } else {
- * System.out.println("not found: " + tmpword); notFound++; } } j++; } }
- * dctFile.close(); System.out.println("num not found:" + notFound);
- * System.out.println("num corrupt:" + corrupt);
- *
- * log.info("test dictionary done! " + dctFilePath + " total:" + total); cnt =
- * 0; for (int j = 0; j < PRIME_BIGRAM_LENGTH; j++) { if (bigramHashTable[j]
- * != 0) { cnt++; } } System.out.println("total num in bigramTable: " + cnt);
- * }
- */
-
private int getAvaliableIndex(long hashId, char carray[]) {
int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH);
int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH;
@@ -307,13 +267,4 @@
return 0;
}
- public static void main(String[] args) throws FileNotFoundException,
- UnsupportedEncodingException, IOException {
- BigramDictionary dic = new BigramDictionary();
- dic.load("D:/analysis-data");
- // dic.test("D:/analysis-data/BigramDict.dct");
- System.out.println("max:" + dic.max);
- System.out.println("average repeat:" + (double) dic.repeat / 328856);
- System.out.println("end");
- }
}
Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/CharType.java
===================================================================
--- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/CharType.java (revision 0)
+++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/CharType.java (working copy)
@@ -15,12 +15,12 @@
* limitations under the License.
*/
-package org.apache.lucene.analysis.cn.smart;
+package org.apache.lucene.analysis.cn;
/**
* Internal SmartChineseAnalyzer character type constants.
*/
-public class CharType {
+final class CharType {
/**
* Punctuation Characters
Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/HHMMSegmenter.java
===================================================================
--- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/HHMMSegmenter.java (revision 0)
+++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/HHMMSegmenter.java (working copy)
@@ -15,18 +15,14 @@
* limitations under the License.
*/
-package org.apache.lucene.analysis.cn.smart.hhmm;
+package org.apache.lucene.analysis.cn;
import java.util.List;
-import org.apache.lucene.analysis.cn.smart.CharType;
-import org.apache.lucene.analysis.cn.smart.Utility;
-import org.apache.lucene.analysis.cn.smart.WordType;
-
/**
* Finds the optimal segmentation of a sentence into Chinese words
*/
-public class HHMMSegmenter {
+class HHMMSegmenter {
private static WordDictionary wordDict = WordDictionary.getInstance();
Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/PathNode.java
===================================================================
--- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/PathNode.java (revision 0)
+++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/PathNode.java (working copy)
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.lucene.analysis.cn.smart.hhmm;
+package org.apache.lucene.analysis.cn;
/**
* SmartChineseAnalyzer internal node representation
@@ -23,7 +23,7 @@
* Used by {@link BiSegGraph} to maximize the segmentation with the Viterbi algorithm.
*
*/
-public class PathNode implements Comparable {
+class PathNode implements Comparable {
public double weight;
public int preNode;
Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegGraph.java
===================================================================
--- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegGraph.java (revision 0)
+++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegGraph.java (working copy)
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.lucene.analysis.cn.smart.hhmm;
+package org.apache.lucene.analysis.cn;
import java.util.ArrayList;
import java.util.HashMap;
@@ -29,7 +29,7 @@
* For each start offset, a list of possible tokens is stored.
*
*/
-public class SegGraph {
+class SegGraph {
/**
* Map of start offsets to ArrayList of tokens at that position
Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegToken.java
===================================================================
--- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegToken.java (revision 0)
+++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegToken.java (working copy)
@@ -15,32 +15,46 @@
* limitations under the License.
*/
-package org.apache.lucene.analysis.cn.smart.hhmm;
+package org.apache.lucene.analysis.cn;
/**
- * SmartChineseAnalyzer internal token
+ * SmartChineseAnalyzer internal token.
*/
-public class SegToken {
- public char[] charArray;
+class SegToken {
+ char[] charArray;
- public int startOffset;
+ int startOffset;
- public int endOffset;
+ int endOffset;
- public int wordType;
+ int wordType;
- public int weight;
+ int weight;
- public int index;
+ int index;
+ /**
+ * Create a new SegToken from a {@link String}
+ *
+ * @param word
+ * @param start start offset of text
+ * @param end end offset of text
+ * @param wordType {@link WordType} of the text
+ * @param weight word frequency
+ */
public SegToken(String word, int start, int end, int wordType, int weight) {
- this.charArray = word.toCharArray();
- this.startOffset = start;
- this.endOffset = end;
- this.wordType = wordType;
- this.weight = weight;
+ this(word.toCharArray(), start, end, wordType, weight);
}
+ /**
+ * Create a new SegToken from a character array.
+ *
+ * @param idArray character array containing text
+ * @param start start offset of text
+ * @param end end offset of text
+ * @param wordType {@link WordType} of the text
+ * @param weight word frequency
+ */
public SegToken(char[] idArray, int start, int end, int wordType, int weight) {
this.charArray = idArray;
this.startOffset = start;
@@ -48,14 +62,4 @@
this.wordType = wordType;
this.weight = weight;
}
-
- // public String toString() {
- // return String.valueOf(charArray) + "/s(" + startOffset + ")e("
- // + endOffset + ")/w(" + weight + ")t(" + wordType + ")";
- // }
-
- // public boolean equals(RawToken t) {
- // return this.startOffset == t.startOffset
- // && this.endOffset == t.endOffset;
- // }
}
Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegTokenFilter.java
===================================================================
--- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegTokenFilter.java (revision 0)
+++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegTokenFilter.java (working copy)
@@ -15,10 +15,8 @@
* limitations under the License.
*/
-package org.apache.lucene.analysis.cn.smart.hhmm;
+package org.apache.lucene.analysis.cn;
-import org.apache.lucene.analysis.cn.smart.Utility;
-import org.apache.lucene.analysis.cn.smart.WordType;
/**
*
@@ -26,7 +24,7 @@
* Additionally, all punctuation is converted into {@link Utility#COMMON_DELIMITER}
*
*/
-public class SegTokenFilter {
+class SegTokenFilter {
/**
* Filter an input {@link SegToken}
Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegTokenPair.java
===================================================================
--- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegTokenPair.java (revision 0)
+++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SegTokenPair.java (working copy)
@@ -15,12 +15,12 @@
* limitations under the License.
*/
-package org.apache.lucene.analysis.cn.smart.hhmm;
+package org.apache.lucene.analysis.cn;
/**
* A pair of tokens in {@link SegGraph}
*/
-public class SegTokenPair {
+class SegTokenPair {
public char[] charArray;
@@ -43,13 +43,4 @@
this.weight = weight;
}
- // public String toString() {
- // return String.valueOf(charArray) + ":f(" + from + ")t(" + to + "):"
- // + weight;
- // }
-
- // public boolean equals(SegTokenPair tp) {
- // return this.from == tp.from && this.to == tp.to;
- // }
-
}
Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SentenceTokenizer.java
===================================================================
--- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SentenceTokenizer.java (revision 0)
+++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SentenceTokenizer.java (working copy)
@@ -15,9 +15,8 @@
* limitations under the License.
*/
-package org.apache.lucene.analysis.cn.smart;
+package org.apache.lucene.analysis.cn;
-import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
@@ -25,35 +24,33 @@
import org.apache.lucene.analysis.Tokenizer;
/**
- * Tokenizes input into sentences.
+ * Tokenizes input text into sentences.
+ *
+ * The output tokens can then be broken into words with {@link WordTokenFilter}
+ *
*/
public class SentenceTokenizer extends Tokenizer {
/**
* End of sentence punctuation: 。,!?;,!?;
*/
- public final static String PUNCTION = "。,!?;,!?;";
+ private final static String PUNCTION = "。,!?;,!?;";
private StringBuffer buffer = new StringBuffer();
- private BufferedReader bufferInput;
-
private int tokenStart = 0, tokenEnd = 0;
- private Token t = new Token();
-
public SentenceTokenizer(Reader reader) {
super(reader);
- bufferInput = new BufferedReader(reader, 2048);
}
- public Token next() throws IOException {
+ public Token next(final Token reusableToken) throws IOException {
buffer.setLength(0);
int ci;
char ch, pch;
boolean atBegin = true;
tokenStart = tokenEnd;
- ci = bufferInput.read();
+ ci = input.read();
ch = (char) ci;
while (true) {
@@ -67,14 +64,14 @@
} else if (atBegin && Utility.SPACES.indexOf(ch) != -1) {
tokenStart++;
tokenEnd++;
- ci = bufferInput.read();
+ ci = input.read();
ch = (char) ci;
} else {
buffer.append(ch);
atBegin = false;
tokenEnd++;
pch = ch;
- ci = bufferInput.read();
+ ci = input.read();
ch = (char) ci;
// Two spaces, such as CR, LF
if (Utility.SPACES.indexOf(ch) != -1
@@ -88,14 +85,10 @@
if (buffer.length() == 0)
return null;
else {
- t.clear();
- t.reinit(buffer.toString(), input.correctOffset(tokenStart), input.correctOffset(tokenEnd), "sentence");
- return t;
+ reusableToken.clear();
+ reusableToken.reinit(buffer.toString(), input.correctOffset(tokenStart), input.correctOffset(tokenEnd), "sentence");
+ return reusableToken;
}
}
- public void close() throws IOException {
- bufferInput.close();
- }
-
}
Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java
===================================================================
--- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java (revision 0)
+++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java (working copy)
@@ -17,24 +17,19 @@
package org.apache.lucene.analysis.cn;
-import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
-import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
-import org.apache.lucene.analysis.cn.smart.WordSegmenter;
-import org.apache.lucene.analysis.cn.smart.WordTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
-import org.apache.lucene.analysis.cn.smart.AnalyzerProfile; // for javadoc
-
/**
*
* SmartChineseAnalyzer is an analyzer for Chinese or mixed Chinese-English text.
@@ -47,19 +42,35 @@
*
*
* This analyzer requires a dictionary to provide statistical data.
- * To specify the location of the dictionary data, refer to {@link AnalyzerProfile}
+ * SmartChineseAnalyzer has an included dictionary out-of-box.
*
*
* The included dictionary data is from ICTCLAS1.0.
* Thanks to ICTCLAS for their hard work, and for contributing the data under the Apache 2 License!
*
+ *
+ * In special circumstances a user may wish to configure SmartChineseAnalyzer with a custom data directory location, containing bigramdict.dct and coredict.dct
+ *
+ * The following order is used to determine the location of the data directory:
+ *
+ *
+ * - System property: -Danalysis.data.dir=/path/to/analysis-data
+ * - Relative path: analysis-data
+ * - Relative path: lib/analysis-data
+ * - Property file: analysis.data.dir property from relative path analysis.properties
+ * - Property file: analysis.data.dir property from relative path lib/analysis.properties
+ *
+ *
+ * Example property file:
+ *
+ *
+ * analysis.data.dir=D:/path/to/analysis-data/
+ *
*/
public class SmartChineseAnalyzer extends Analyzer {
private Set stopWords = null;
- private WordSegmenter wordSegment;
-
/**
* Create a new SmartChineseAnalyzer, using the default stopword list.
*/
@@ -80,10 +91,15 @@
*/
public SmartChineseAnalyzer(boolean useDefaultStopWords) {
if (useDefaultStopWords) {
- stopWords = loadStopWords(this.getClass().getResourceAsStream(
- "stopwords.txt"));
+ try {
+ InputStream stream = this.getClass().getResourceAsStream("stopwords.txt");
+ InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
+ stopWords = WordlistLoader.getWordSet(reader, "//");
+ } catch (IOException e) {
+ // TODO: throw IOException
+ throw new RuntimeException(e);
+ }
}
- wordSegment = new WordSegmenter();
}
/**
@@ -94,16 +110,14 @@
* Note: the set should include punctuation, unless you want to index punctuation!
*
* @param stopWords {@link Set} of stopwords to use.
- * @see SmartChineseAnalyzer#loadStopWords(InputStream)
*/
public SmartChineseAnalyzer(Set stopWords) {
this.stopWords = stopWords;
- wordSegment = new WordSegmenter();
}
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new SentenceTokenizer(reader);
- result = new WordTokenizer(result, wordSegment);
+ result = new WordTokenFilter(result);
// result = new LowerCaseFilter(result);
// LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text.
// The porter stemming is too strict, this is not a bug, this is a feature:)
@@ -113,37 +127,28 @@
}
return result;
}
-
- /**
- * Utility function to return a {@link Set} of stopwords from a UTF-8 encoded {@link InputStream}.
- * The comment "//" can be used in the stopword list.
- *
- * @param input {@link InputStream} of UTF-8 encoded stopwords
- * @return {@link Set} of stopwords.
- */
- public static Set loadStopWords(InputStream input) {
- /*
- * Note: WordListLoader is not used here because this method allows for inline "//" comments.
- * WordListLoader will only filter out these comments if they are on a separate line.
- */
- String line;
- Set stopWords = new HashSet();
- try {
- BufferedReader br = new BufferedReader(new InputStreamReader(input,
- "UTF-8"));
- while ((line = br.readLine()) != null) {
- if (line.indexOf("//") != -1) {
- line = line.substring(0, line.indexOf("//"));
- }
- line = line.trim();
- if (line.length() != 0)
- stopWords.add(line.toLowerCase());
+
+ private static final class SavedStreams {
+ Tokenizer tokenStream;
+ TokenStream filteredTokenStream;
+ }
+
+ public TokenStream reusableTokenStream(String fieldName, Reader reader)
+ throws IOException {
+ SavedStreams streams = (SavedStreams) getPreviousTokenStream();
+ if (streams == null) {
+ streams = new SavedStreams();
+ setPreviousTokenStream(streams);
+ streams.tokenStream = new SentenceTokenizer(reader);
+ streams.filteredTokenStream = new WordTokenFilter(streams.tokenStream);
+ streams.filteredTokenStream = new PorterStemFilter(streams.filteredTokenStream);
+ if (stopWords != null) {
+ streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopWords, false);
}
- br.close();
- } catch (IOException e) {
- System.err.println("WARNING: cannot open stop words list!");
+ } else {
+ streams.tokenStream.reset(reader);
}
- return stopWords;
+
+ return streams.filteredTokenStream;
}
-
}
Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/Utility.java
===================================================================
--- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/Utility.java (revision 0)
+++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/Utility.java (working copy)
@@ -15,15 +15,13 @@
* limitations under the License.
*/
-package org.apache.lucene.analysis.cn.smart;
+package org.apache.lucene.analysis.cn;
-import org.apache.lucene.analysis.cn.smart.hhmm.BiSegGraph; // for javadoc
-import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter; // for javadoc
/**
* SmartChineseAnalyzer utility constants and methods
*/
-public class Utility {
+class Utility {
public static final char[] STRING_CHAR_ARRAY = new String("未##串")
.toCharArray();
Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordDictionary.java
===================================================================
--- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordDictionary.java (revision 0)
+++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordDictionary.java (working copy)
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.lucene.analysis.cn.smart.hhmm;
+package org.apache.lucene.analysis.cn;
import java.io.File;
import java.io.FileInputStream;
@@ -30,14 +30,11 @@
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
-import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
-import org.apache.lucene.analysis.cn.smart.Utility;
-
/**
* SmartChineseAnalyzer Word Dictionary
*
*/
-public class WordDictionary extends AbstractDictionary {
+class WordDictionary extends AbstractDictionary {
private WordDictionary() {
}
@@ -550,18 +547,4 @@
wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0;
}
- public static void main(String[] args) throws FileNotFoundException,
- IOException {
- WordDictionary dic = new WordDictionary();
- dic.load("D:/analysis-data");
- Utility.getCharType('。');
- Utility.getCharType('汗');
- Utility.getCharType(' ');// 0020
- Utility.getCharType(' ');// 3000
- Utility.getCharType('');// E095
- Utility.getCharType(' ');// 3000
- Utility.getCharType('\r');// 000D
- Utility.getCharType('\n');// 000A
- Utility.getCharType('\t');// 0009
- }
}
Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordSegmenter.java
===================================================================
--- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordSegmenter.java (revision 0)
+++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordSegmenter.java (working copy)
@@ -15,27 +15,24 @@
* limitations under the License.
*/
-package org.apache.lucene.analysis.cn.smart;
+package org.apache.lucene.analysis.cn;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter;
-import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;
-import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter;
/**
* Segment a sentence of Chinese text into words.
*/
-public class WordSegmenter {
+class WordSegmenter {
private HHMMSegmenter hhmmSegmenter = new HHMMSegmenter();
private SegTokenFilter tokenFilter = new SegTokenFilter();
/**
- * Segment a sentence into words with {@link HHMMSegmenter}
+ * Segment a sentence of Chinese text into words
*
* @param sentenceToken sentence {@link Token}
* @return {@link List} of {@link SegToken}
Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordTokenFilter.java
===================================================================
--- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordTokenFilter.java (revision 0)
+++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordTokenFilter.java (working copy)
@@ -15,47 +15,43 @@
* limitations under the License.
*/
-package org.apache.lucene.analysis.cn.smart;
+package org.apache.lucene.analysis.cn;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
/**
- * A {@link Tokenizer} that breaks sentences into words.
+ * A {@link TokenFilter} that breaks sentences into words.
*/
-public class WordTokenizer extends Tokenizer {
+public class WordTokenFilter extends TokenFilter {
private WordSegmenter wordSegmenter;
- private TokenStream in;
-
private Iterator tokenIter;
private List tokenBuffer;
- private Token sentenceToken = new Token();
-
/**
* Construct a new WordTokenizer.
*
- * @param in {@link TokenStream} of sentences
- * @param wordSegmenter {@link WordSegmenter} to break sentences into words
+ * @param in {@link TokenStream} of sentences
*/
- public WordTokenizer(TokenStream in, WordSegmenter wordSegmenter) {
- this.in = in;
- this.wordSegmenter = wordSegmenter;
+ public WordTokenFilter(TokenStream in) {
+ super(in);
+ this.wordSegmenter = new WordSegmenter();
}
- public Token next() throws IOException {
+ public Token next(final Token reusableSentenceToken) throws IOException {
if (tokenIter != null && tokenIter.hasNext())
return (Token) tokenIter.next();
else {
- if (processNextSentence()) {
+ Token nextToken = input.next(reusableSentenceToken);
+ if (processNextSentence(nextToken)) {
return (Token) tokenIter.next();
} else
return null;
@@ -65,20 +61,15 @@
/**
* Process the next input sentence, placing tokens into tokenBuffer
*
+ * @param reusableSentenceToken input sentence
* @return true if more tokens were placed into tokenBuffer.
* @throws IOException
*/
- private boolean processNextSentence() throws IOException {
- sentenceToken = in.next(sentenceToken);
- if (sentenceToken == null)
+ private boolean processNextSentence(final Token reusableSentenceToken) throws IOException {
+ if (reusableSentenceToken == null)
return false;
- tokenBuffer = wordSegmenter.segmentSentence(sentenceToken);
+ tokenBuffer = wordSegmenter.segmentSentence(reusableSentenceToken);
tokenIter = tokenBuffer.iterator();
return tokenBuffer != null && tokenIter.hasNext();
}
-
- public void close() throws IOException {
- in.close();
- }
-
}
Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordType.java
===================================================================
--- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordType.java (revision 0)
+++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/WordType.java (working copy)
@@ -15,12 +15,12 @@
* limitations under the License.
*/
-package org.apache.lucene.analysis.cn.smart;
+package org.apache.lucene.analysis.cn;
/**
* Internal SmartChineseAnalyzer token type constants
*/
-public class WordType {
+final class WordType {
/**
* Start of a Sentence
Index: contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/package.html
===================================================================
--- contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/package.html (revision 0)
+++ contrib/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/package.html (revision 0)
@@ -0,0 +1,24 @@
+
+
+
+
+
+Analyzer for Simplified Chinese, which indexes words.
+
+Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
+
+ - ChineseAnalyzer (in the analyzers/cn package): Index unigrams (individual Chinese characters) as a token.
+
- CJKAnalyzer (in the analyzers/cjk package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
+
- SmartChineseAnalyzer (in this package): Index words (attempt to segment Chinese text into words) as tokens.
+
+
+Example phrase: "我是中国人"
+
+ - ChineseAnalyzer: 我-是-中-国-人
+ - CJKAnalyzer: 我是-是中-中国-国人
+ - SmartChineseAnalyzer: 我-是-中国-人
+
+
+
+
+
\ No newline at end of file
Index: contrib/analysis/smartcn/src/resources/org/apache/lucene/analysis/cn/stopwords.txt
===================================================================
--- contrib/analysis/smartcn/src/resources/org/apache/lucene/analysis/cn/stopwords.txt (revision 0)
+++ contrib/analysis/smartcn/src/resources/org/apache/lucene/analysis/cn/stopwords.txt (working copy)
@@ -51,7 +51,8 @@
[
]
●
- //IDEOGRAPHIC SPACE character (Used as a space in Chinese)
+// the line below contains an IDEOGRAPHIC SPACE character (Used as a space in Chinese)
+
//////////////// English Stop Words ////////////////
Index: contrib/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java
===================================================================
--- contrib/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java (revision 0)
+++ contrib/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java (working copy)
@@ -40,6 +40,28 @@
}
/*
+ * This test is the same as the above, except with two phrases.
+ * This tests to ensure the SentenceTokenizer->WordTokenFilter chain works correctly.
+ */
+ public void testChineseStopWordsDefaultTwoPhrases() throws Exception {
+ Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */
+ String sentence = "我购买了道具和服装。 我购买了道具和服装。";
+ String result[] = { "我", "购买", "了", "道具", "和", "服装", "我", "购买", "了", "道具", "和", "服装" };
+ assertAnalyzesTo(ca, sentence, result);
+ }
+
+ /*
+ * This test is the same as the above, except using an ideographic space as a separator.
+ * This tests to ensure the stopwords are working correctly.
+ */
+ public void testChineseStopWordsDefaultTwoPhrasesIdeoSpache() throws Exception {
+ Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */
+ String sentence = "我购买了道具和服装 我购买了道具和服装。";
+ String result[] = { "我", "购买", "了", "道具", "和", "服装", "我", "购买", "了", "道具", "和", "服装" };
+ assertAnalyzesTo(ca, sentence, result);
+ }
+
+ /*
* Punctuation is handled in a strange way if you disable stopwords
* In this example the IDEOGRAPHIC FULL STOP is converted into a comma.
* if you don't supply (true) to the constructor, or use a different stopwords list,