Index: build.xml =================================================================== --- build.xml (revision 796589) +++ build.xml (working copy) @@ -306,7 +306,8 @@ - + + Index: contrib/analyzers/build.xml =================================================================== --- contrib/analyzers/build.xml (revision 796589) +++ contrib/analyzers/build.xml (working copy) @@ -21,10 +21,52 @@ Additional Analyzers + - common: Additional Analyzers + - smartcn: Smart Analyzer for Simplified Chinese Text - - + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: contrib/analyzers/common/build.xml =================================================================== --- contrib/analyzers/common/build.xml (revision 0) +++ contrib/analyzers/common/build.xml (working copy) @@ -25,6 +25,17 @@ + + + + - + + + + + + + + Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/package.html =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/package.html (revision 796589) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/package.html (working copy) @@ -1,5 +1,24 @@ - + + + + -Analyzer for Chinese, Japanese and Korean. +Analyzer for Chinese, Japanese, and Korean, which indexes bigrams (overlapping groups of two adjacent Han characters). +

+Three analyzers are provided for Chinese, each of which treats Chinese text in a different way. +

    +
  • ChineseAnalyzer (in the analyzers/cn package): Index unigrams (individual Chinese characters) as tokens. +
  • CJKAnalyzer (in this package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens. +
  • SmartChineseAnalyzer (in the analyzers/smartcn package): Index words (attempt to segment Chinese text into words) as tokens. +
+ +Example phrase: "我是中国人" +
    +
  1. ChineseAnalyzer: 我-是-中-国-人 +
  2. CJKAnalyzer: 我是-是中-中国-国人 +
  3. SmartChineseAnalyzer: 我-是-中国-人 +
+
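The segmentations above can be reproduced with a short driver. The sketch below is not part of this patch (the class name is illustrative) and assumes this version's reusable-Token TokenStream API; it prints the CJKAnalyzer bigrams for the example phrase, and substituting ChineseAnalyzer or SmartChineseAnalyzer prints the corresponding segmentation instead.

    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cjk.CJKAnalyzer;

    // Illustrative driver, not part of the patch.
    public class CJKSegmentationDemo {
      public static void main(String[] args) throws Exception {
        Analyzer analyzer = new CJKAnalyzer();
        TokenStream ts = analyzer.tokenStream("content", new StringReader("我是中国人"));
        final Token reusableToken = new Token();
        // Expected output: the overlapping bigrams 我是, 是中, 中国, 国人
        for (Token token = ts.next(reusableToken); token != null; token = ts.next(reusableToken)) {
          System.out.println(token.term());
        }
      }
    }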

+ - + \ No newline at end of file Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/package.html =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/package.html (revision 796589) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/package.html (working copy) @@ -1,13 +1,15 @@ - + + + -Analyzers for Chinese. +Analyzer for Chinese, which indexes unigrams (individual Chinese characters).

Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.

    -
  • ChineseAnalyzer: Index unigrams (individual Chinese characters) as a token. -
  • CJKAnalyzer: Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens. -
  • SmartChineseAnalyzer: Index words (attempt to segment Chinese text into words) as tokens. +
  • ChineseAnalyzer (in this package): Index unigrams (individual Chinese characters) as tokens. +
  • CJKAnalyzer (in the analyzers/cjk package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens. +
  • SmartChineseAnalyzer (in the analyzers/smartcn package): Index words (attempt to segment Chinese text into words) as tokens.
Example phrase: "我是中国人" Index: contrib/analyzers/smartcn/build.xml =================================================================== --- contrib/analyzers/smartcn/build.xml (revision 0) +++ contrib/analyzers/smartcn/build.xml (revision 0) @@ -0,0 +1,41 @@ + + + + + + + + Smart Chinese Analyzer + + + + + + + + + + + + + + + + + + Index: contrib/analyzers/smartcn/pom.xml.template =================================================================== --- contrib/analyzers/smartcn/pom.xml.template (revision 0) +++ contrib/analyzers/smartcn/pom.xml.template (revision 0) @@ -0,0 +1,35 @@ + + + + 4.0.0 + + org.apache.lucene + lucene-contrib + @version@ + + org.apache.lucene + lucene-smartcn + Lucene Smart Chinese Analyzer + @version@ + Smart Chinese Analyzer + jar + Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java =================================================================== --- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java (revision 0) +++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java (working copy) @@ -17,24 +17,21 @@ package org.apache.lucene.analysis.cn; -import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; -import java.util.HashSet; import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.PorterStemFilter; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.WordlistLoader; import org.apache.lucene.analysis.cn.smart.SentenceTokenizer; -import org.apache.lucene.analysis.cn.smart.WordSegmenter; -import org.apache.lucene.analysis.cn.smart.WordTokenizer; +import org.apache.lucene.analysis.cn.smart.WordTokenFilter; -import org.apache.lucene.analysis.cn.smart.AnalyzerProfile; // for javadoc - /** *

* SmartChineseAnalyzer is an analyzer for Chinese or mixed Chinese-English text. @@ -47,19 +44,35 @@ *

*

* This analyzer requires a dictionary to provide statistical data. - * To specify the location of the dictionary data, refer to {@link AnalyzerProfile} + * SmartChineseAnalyzer includes a dictionary out of the box. *

*

* The included dictionary data is from ICTCLAS1.0. * Thanks to ICTCLAS for their hard work, and for contributing the data under the Apache 2 License! *

+ *

+ * In special circumstances, a user may wish to configure SmartChineseAnalyzer with a custom data directory containing bigramdict.dct and coredict.dct. + *

+ * The following order is used to determine the location of the data directory: + * + *
    + *
  1. System property: -Danalysis.data.dir=/path/to/analysis-data + *
  2. Relative path: analysis-data + *
  3. Relative path: lib/analysis-data + *
  4. Property file: analysis.data.dir property from relative path analysis.properties + *
  5. Property file: analysis.data.dir property from relative path lib/analysis.properties + *
+ * + * Example property file: + * + *
+ * analysis.data.dir=D:/path/to/analysis-data/
+ * 
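+ *
+ * For example (a sketch; the path is illustrative), the system property can be set programmatically before the analyzer is first used:
+ *
+ * System.setProperty("analysis.data.dir", "/path/to/analysis-data");
+ * Analyzer analyzer = new SmartChineseAnalyzer();
+ *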
*/ public class SmartChineseAnalyzer extends Analyzer { private Set stopWords = null; - private WordSegmenter wordSegment; - /** * Create a new SmartChineseAnalyzer, using the default stopword list. */ @@ -80,10 +93,15 @@ */ public SmartChineseAnalyzer(boolean useDefaultStopWords) { if (useDefaultStopWords) { - stopWords = loadStopWords(this.getClass().getResourceAsStream( - "stopwords.txt")); + try { + InputStream stream = this.getClass().getResourceAsStream("stopwords.txt"); + InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); + stopWords = WordlistLoader.getWordSet(reader, "//"); + } catch (IOException e) { + // TODO: throw IOException + throw new RuntimeException(e); + } } - wordSegment = new WordSegmenter(); } /** @@ -94,16 +112,14 @@ * Note: the set should include punctuation, unless you want to index punctuation! *

* @param stopWords {@link Set} of stopwords to use. - * @see SmartChineseAnalyzer#loadStopWords(InputStream) */ public SmartChineseAnalyzer(Set stopWords) { this.stopWords = stopWords; - wordSegment = new WordSegmenter(); } public TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new SentenceTokenizer(reader); - result = new WordTokenizer(result, wordSegment); + result = new WordTokenFilter(result); // result = new LowerCaseFilter(result); // LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text. // The porter stemming is too strict, this is not a bug, this is a feature:) @@ -113,37 +129,28 @@ } return result; } - - /** - * Utility function to return a {@link Set} of stopwords from a UTF-8 encoded {@link InputStream}. - * The comment "//" can be used in the stopword list. - * - * @param input {@link InputStream} of UTF-8 encoded stopwords - * @return {@link Set} of stopwords. - */ - public static Set loadStopWords(InputStream input) { - /* - * Note: WordListLoader is not used here because this method allows for inline "//" comments. - * WordListLoader will only filter out these comments if they are on a separate line. - */ - String line; - Set stopWords = new HashSet(); - try { - BufferedReader br = new BufferedReader(new InputStreamReader(input, - "UTF-8")); - while ((line = br.readLine()) != null) { - if (line.indexOf("//") != -1) { - line = line.substring(0, line.indexOf("//")); - } - line = line.trim(); - if (line.length() != 0) - stopWords.add(line.toLowerCase()); + + private static final class SavedStreams { + Tokenizer tokenStream; + TokenStream filteredTokenStream; + } + + public TokenStream reusableTokenStream(String fieldName, Reader reader) + throws IOException { + SavedStreams streams = (SavedStreams) getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + setPreviousTokenStream(streams); + streams.tokenStream = new SentenceTokenizer(reader); + streams.filteredTokenStream = new WordTokenFilter(streams.tokenStream); + streams.filteredTokenStream = new PorterStemFilter(streams.filteredTokenStream); + if (stopWords != null) { + streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopWords, false); } - br.close(); - } catch (IOException e) { - System.err.println("WARNING: cannot open stop words list!"); + } else { + streams.tokenStream.reset(reader); } - return stopWords; + + return streams.filteredTokenStream; } - } Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/package.html =================================================================== --- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/package.html (revision 0) +++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/package.html (revision 0) @@ -0,0 +1,24 @@ + + + + + +Analyzer for Simplified Chinese, which indexes words. +

+Three analyzers are provided for Chinese, each of which treats Chinese text in a different way. +

    +
  • ChineseAnalyzer (in the analyzers/cn package): Index unigrams (individual Chinese characters) as tokens. +
  • CJKAnalyzer (in the analyzers/cjk package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens. +
  • SmartChineseAnalyzer (in this package): Index words (attempt to segment Chinese text into words) as tokens. +
+ +Example phrase: "我是中国人" +
    +
  1. ChineseAnalyzer: 我-是-中-国-人 +
  2. CJKAnalyzer: 我是-是中-中国-国人 +
  3. SmartChineseAnalyzer: 我-是-中国-人 +
+
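A minimal usage sketch (not part of this patch; names are illustrative). It prints the word-level segmentation with the bundled stopword list, and shows the Set-based constructor for callers that need their own stopwords; as the analyzer's javadoc notes, such a set should normally include punctuation.

    import java.io.StringReader;
    import java.util.HashSet;
    import java.util.Set;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cn.SmartChineseAnalyzer;

    // Illustrative driver, not part of the patch.
    public class SmartcnDemo {
      public static void main(String[] args) throws Exception {
        // The default constructor loads the bundled stopword list (largely punctuation).
        Analyzer analyzer = new SmartChineseAnalyzer();
        TokenStream ts = analyzer.tokenStream("content", new StringReader("我是中国人"));
        final Token reusableToken = new Token();
        // Expected output: the segmented words 我, 是, 中国, 人
        for (Token token = ts.next(reusableToken); token != null; token = ts.next(reusableToken)) {
          System.out.println(token.term());
        }

        // A custom stopword Set may be supplied instead; include punctuation
        // unless punctuation tokens should be indexed.
        Set stopWords = new HashSet();
        stopWords.add("。");
        Analyzer custom = new SmartChineseAnalyzer(stopWords);
      }
    }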

+ + + Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java =================================================================== --- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java (revision 796589) +++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java (working copy) @@ -23,7 +23,7 @@ import java.util.Properties; /** - * Configure analysis data for SmartChineseAnalyzer + * Manages analysis data configuration for SmartChineseAnalyzer *

* SmartChineseAnalyzer has a built-in dictionary and stopword list out of the box. *

Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java =================================================================== --- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (revision 796589) +++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (working copy) @@ -17,7 +17,6 @@ package org.apache.lucene.analysis.cn.smart; -import java.io.BufferedReader; import java.io.IOException; import java.io.Reader; @@ -25,35 +24,33 @@ import org.apache.lucene.analysis.Tokenizer; /** - * Tokenizes input into sentences. + * Tokenizes input text into sentences. + *

+ * The output tokens can then be broken into words with {@link WordTokenFilter}. + *

*/ public class SentenceTokenizer extends Tokenizer { /** * End of sentence punctuation: 。,!?;,!?; */ - public final static String PUNCTION = "。,!?;,!?;"; + private final static String PUNCTION = "。,!?;,!?;"; private StringBuffer buffer = new StringBuffer(); - private BufferedReader bufferInput; - private int tokenStart = 0, tokenEnd = 0; - private Token t = new Token(); - public SentenceTokenizer(Reader reader) { super(reader); - bufferInput = new BufferedReader(reader, 2048); } - public Token next() throws IOException { + public Token next(final Token reusableToken) throws IOException { buffer.setLength(0); int ci; char ch, pch; boolean atBegin = true; tokenStart = tokenEnd; - ci = bufferInput.read(); + ci = input.read(); ch = (char) ci; while (true) { @@ -67,14 +64,14 @@ } else if (atBegin && Utility.SPACES.indexOf(ch) != -1) { tokenStart++; tokenEnd++; - ci = bufferInput.read(); + ci = input.read(); ch = (char) ci; } else { buffer.append(ch); atBegin = false; tokenEnd++; pch = ch; - ci = bufferInput.read(); + ci = input.read(); ch = (char) ci; // Two spaces, such as CR, LF if (Utility.SPACES.indexOf(ch) != -1 @@ -88,14 +85,10 @@ if (buffer.length() == 0) return null; else { - t.clear(); - t.reinit(buffer.toString(), input.correctOffset(tokenStart), input.correctOffset(tokenEnd), "sentence"); - return t; + reusableToken.clear(); + reusableToken.reinit(buffer.toString(), input.correctOffset(tokenStart), input.correctOffset(tokenEnd), "sentence"); + return reusableToken; } } - public void close() throws IOException { - bufferInput.close(); - } - } Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java =================================================================== --- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java (revision 796589) +++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java (working copy) @@ -17,7 +17,6 @@ package org.apache.lucene.analysis.cn.smart; -import org.apache.lucene.analysis.cn.smart.hhmm.BiSegGraph; // for javadoc import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter; // for javadoc /** @@ -47,7 +46,7 @@ public static final String SPACES = "  \t\r\n"; /** - * Maximum bigram frequency (used in the {@link BiSegGraph} smoothing function). + * Maximum bigram frequency (used in the smoothing function). */ public static final int MAX_FREQUENCE = 2079997 + 80000; Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java =================================================================== --- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java (revision 796589) +++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java (working copy) @@ -28,7 +28,7 @@ /** * Segment a sentence of Chinese text into words. 
*/ -public class WordSegmenter { +class WordSegmenter { private HHMMSegmenter hhmmSegmenter = new HHMMSegmenter(); Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java =================================================================== --- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java (revision 796589) +++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java (working copy) @@ -22,40 +22,36 @@ import java.util.List; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; /** - * A {@link Tokenizer} that breaks sentences into words. + * A {@link TokenFilter} that breaks sentences into words. */ -public class WordTokenizer extends Tokenizer { +public class WordTokenFilter extends TokenFilter { private WordSegmenter wordSegmenter; - private TokenStream in; - private Iterator tokenIter; private List tokenBuffer; - private Token sentenceToken = new Token(); - /** * Construct a new WordTokenizer. * - * @param in {@link TokenStream} of sentences - * @param wordSegmenter {@link WordSegmenter} to break sentences into words + * @param in {@link TokenStream} of sentences */ - public WordTokenizer(TokenStream in, WordSegmenter wordSegmenter) { - this.in = in; - this.wordSegmenter = wordSegmenter; + public WordTokenFilter(TokenStream in) { + super(in); + this.wordSegmenter = new WordSegmenter(); } - public Token next() throws IOException { + public Token next(final Token reusableSentenceToken) throws IOException { if (tokenIter != null && tokenIter.hasNext()) return (Token) tokenIter.next(); else { - if (processNextSentence()) { + Token nextToken = input.next(reusableSentenceToken); + if (processNextSentence(nextToken)) { return (Token) tokenIter.next(); } else return null; @@ -65,20 +61,15 @@ /** * Process the next input sentence, placing tokens into tokenBuffer * + * @param reusableSentenceToken input sentence * @return true if more tokens were placed into tokenBuffer. 
* @throws IOException */ - private boolean processNextSentence() throws IOException { - sentenceToken = in.next(sentenceToken); - if (sentenceToken == null) + private boolean processNextSentence(final Token reusableSentenceToken) throws IOException { + if (reusableSentenceToken == null) return false; - tokenBuffer = wordSegmenter.segmentSentence(sentenceToken); + tokenBuffer = wordSegmenter.segmentSentence(reusableSentenceToken); tokenIter = tokenBuffer.iterator(); return tokenBuffer != null && tokenIter.hasNext(); } - - public void close() throws IOException { - in.close(); - } - } Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/package.html =================================================================== --- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/package.html (revision 0) +++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/package.html (revision 0) @@ -0,0 +1,5 @@ + + +SmartChineseAnalyzer Tokenizers and TokenFilters + + \ No newline at end of file Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java =================================================================== --- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java (revision 796589) +++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java (working copy) @@ -27,7 +27,7 @@ * Contains methods for dealing with GB2312 encoding. *

*/ -public abstract class AbstractDictionary { +abstract class AbstractDictionary { /** * First Chinese Character in GB2312 (15 * 94) * Characters in GB2312 are arranged in a grid of 94 * 94, 0-14 are unassigned or punctuation. Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java =================================================================== --- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java (revision 796589) +++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java (working copy) @@ -32,7 +32,7 @@ * For each start offset, a list of possible token pairs is stored. *

*/ -public class BiSegGraph { +class BiSegGraph { private Map tokenPairListTable = new HashMap(); Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java =================================================================== --- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java (revision 796589) +++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java (working copy) @@ -35,7 +35,7 @@ /** * SmartChineseAnalyzer Bigram dictionary. */ -public class BigramDictionary extends AbstractDictionary { +class BigramDictionary extends AbstractDictionary { private BigramDictionary() { } @@ -208,45 +208,6 @@ // log.info("load dictionary done! " + dctFilePath + " total:" + total); } - /* - * public void test(String dctFilePath) throws IOException { int i, cnt, - * length, total = 0; int corrupt = 0, notFound = 0; // - * 文件中只统计了6763个汉字加5个空汉字符3756~3760,其中第3756个用来存储符号信息。 int[] buffer = new int[3]; - * byte[] intBuffer = new byte[4]; String tmpword; RandomAccessFile dctFile = - * new RandomAccessFile(dctFilePath, "r"); - * - * // 字典文件中第一个汉字出现的位置是0,最后一个是6768 for (i = GB2312_FIRST_CHAR; i < - * GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { String currentStr = - * getCCByGB2312Id(i); // if (i == 5231) // System.out.println(i); - * - * dctFile.read(intBuffer);// 原词库文件在c下开发,所以写入的文件为little // endian编码,而java为big - * endian,必须转换过来 cnt = - * ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN) .getInt(); if - * (cnt <= 0) { continue; } total += cnt; int j = 0; while (j < cnt) { - * dctFile.read(intBuffer); buffer[0] = ByteBuffer.wrap(intBuffer).order( - * ByteOrder.LITTLE_ENDIAN).getInt();// frequency dctFile.read(intBuffer); - * buffer[1] = ByteBuffer.wrap(intBuffer).order( - * ByteOrder.LITTLE_ENDIAN).getInt();// length dctFile.read(intBuffer); // - * buffer[2] = ByteBuffer.wrap(intBuffer).order( // - * ByteOrder.LITTLE_ENDIAN).getInt();// handle - * - * length = buffer[1]; if (length > 0) { byte[] lchBuffer = new byte[length]; - * dctFile.read(lchBuffer); tmpword = new String(lchBuffer, "GB2312"); if (i - * != 3755 + GB2312_FIRST_CHAR) { tmpword = currentStr + tmpword; } char - * carray[] = tmpword.toCharArray(); int index = getBigramItemIndex(carray); - * if (index != -1) { // if (!bigramStringTable[index].equals(tmpword)) { // - * System.out.println("corrupt: " + tmpword + "<->" // + - * bigramStringTable[index]); // corrupt++; // } } else { - * System.out.println("not found: " + tmpword); notFound++; } } j++; } } - * dctFile.close(); System.out.println("num not found:" + notFound); - * System.out.println("num corrupt:" + corrupt); - * - * log.info("test dictionary done! 
" + dctFilePath + " total:" + total); cnt = - * 0; for (int j = 0; j < PRIME_BIGRAM_LENGTH; j++) { if (bigramHashTable[j] - * != 0) { cnt++; } } System.out.println("total num in bigramTable: " + cnt); - * } - */ - private int getAvaliableIndex(long hashId, char carray[]) { int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH); int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH; @@ -307,13 +268,4 @@ return 0; } - public static void main(String[] args) throws FileNotFoundException, - UnsupportedEncodingException, IOException { - BigramDictionary dic = new BigramDictionary(); - dic.load("D:/analysis-data"); - // dic.test("D:/analysis-data/BigramDict.dct"); - System.out.println("max:" + dic.max); - System.out.println("average repeat:" + (double) dic.repeat / 328856); - System.out.println("end"); - } } Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java =================================================================== --- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java (revision 796589) +++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java (working copy) @@ -23,7 +23,7 @@ * Used by {@link BiSegGraph} to maximize the segmentation with the Viterbi algorithm. *

*/ -public class PathNode implements Comparable { +class PathNode implements Comparable { public double weight; public int preNode; Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java =================================================================== --- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java (revision 796589) +++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java (working copy) @@ -29,7 +29,7 @@ * For each start offset, a list of possible tokens is stored. *

*/ -public class SegGraph { +class SegGraph { /** * Map of start offsets to ArrayList of tokens at that position Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java =================================================================== --- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java (revision 796589) +++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java (working copy) @@ -17,30 +17,64 @@ package org.apache.lucene.analysis.cn.smart.hhmm; +import org.apache.lucene.analysis.cn.smart.WordType; // for javadocs + /** * SmartChineseAnalyzer internal token */ public class SegToken { + /** + * Character array containing token text + */ public char[] charArray; + /** + * start offset into {@link #charArray} + */ public int startOffset; + /** + * end offset into {@link #charArray} + */ public int endOffset; + /** + * {@link WordType} of the text + */ public int wordType; + /** + * word frequency + */ public int weight; + /** + * during segmentation, this is used to store the index of the token in the token list table + */ public int index; + /** + * Create a new SegToken from a {@link String} + * + * @param word String containing text + * @param start start offset into word + * @param end end offset of word + * @param wordType {@link WordType} of the text + * @param weight word frequency + */ public SegToken(String word, int start, int end, int wordType, int weight) { - this.charArray = word.toCharArray(); - this.startOffset = start; - this.endOffset = end; - this.wordType = wordType; - this.weight = weight; + this(word.toCharArray(), start, end, wordType, weight); } + /** + * Create a new SegToken from a character array. + * + * @param idArray character array containing text + * @param start start offset into idArray + * @param end end offset of idArray + * @param wordType {@link WordType} of the text + * @param weight word frequency + */ public SegToken(char[] idArray, int start, int end, int wordType, int weight) { this.charArray = idArray; this.startOffset = start; @@ -49,13 +83,4 @@ this.weight = weight; } - // public String toString() { - // return String.valueOf(charArray) + "/s(" + startOffset + ")e(" - // + endOffset + ")/w(" + weight + ")t(" + wordType + ")"; - // } - - // public boolean equals(RawToken t) { - // return this.startOffset == t.startOffset - // && this.endOffset == t.endOffset; - // } } Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java =================================================================== --- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java (revision 796589) +++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java (working copy) @@ -20,7 +20,7 @@ /** * A pair of tokens in {@link SegGraph} */ -public class SegTokenPair { +class SegTokenPair { public char[] charArray; @@ -43,13 +43,4 @@ this.weight = weight; } - // public String toString() { - // return String.valueOf(charArray) + ":f(" + from + ")t(" + to + "):" - // + weight; - // } - - // public boolean equals(SegTokenPair tp) { - // return this.from == tp.from && this.to == tp.to; - // } - } Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java =================================================================== --- 
contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java (revision 796589) +++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java (working copy) @@ -37,7 +37,7 @@ * SmartChineseAnalyzer Word Dictionary * */ -public class WordDictionary extends AbstractDictionary { +class WordDictionary extends AbstractDictionary { private WordDictionary() { } @@ -550,18 +550,4 @@ wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0; } - public static void main(String[] args) throws FileNotFoundException, - IOException { - WordDictionary dic = new WordDictionary(); - dic.load("D:/analysis-data"); - Utility.getCharType('。'); - Utility.getCharType('汗'); - Utility.getCharType(' ');// 0020 - Utility.getCharType(' ');// 3000 - Utility.getCharType('');// E095 - Utility.getCharType(' ');// 3000 - Utility.getCharType('\r');// 000D - Utility.getCharType('\n');// 000A - Utility.getCharType('\t');// 0009 - } } Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/package.html =================================================================== --- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/package.html (revision 0) +++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/package.html (revision 0) @@ -0,0 +1,5 @@ + + +SmartChineseAnalyzer Hidden Markov Model package + + \ No newline at end of file Index: contrib/analyzers/smartcn/src/resources/org/apache/lucene/analysis/cn/stopwords.txt =================================================================== --- contrib/analyzers/smartcn/src/resources/org/apache/lucene/analysis/cn/stopwords.txt (revision 796589) +++ contrib/analyzers/smartcn/src/resources/org/apache/lucene/analysis/cn/stopwords.txt (working copy) @@ -51,7 +51,8 @@ [ ] ● - //IDEOGRAPHIC SPACE character (Used as a space in Chinese) +// the line below contains an IDEOGRAPHIC SPACE character (Used as a space in Chinese) +  //////////////// English Stop Words //////////////// Index: contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java =================================================================== --- contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java (revision 0) +++ contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java (working copy) @@ -40,6 +40,28 @@ } /* + * This test is the same as the above, except with two phrases. + * This tests to ensure the SentenceTokenizer->WordTokenFilter chain works correctly. + */ + public void testChineseStopWordsDefaultTwoPhrases() throws Exception { + Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */ + String sentence = "我购买了道具和服装。 我购买了道具和服装。"; + String result[] = { "我", "购买", "了", "道具", "和", "服装", "我", "购买", "了", "道具", "和", "服装" }; + assertAnalyzesTo(ca, sentence, result); + } + + /* + * This test is the same as the above, except using an ideographic space as a separator. + * This tests to ensure the stopwords are working correctly. 
*/ + public void testChineseStopWordsDefaultTwoPhrasesIdeoSpace() throws Exception { + Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */ + String sentence = "我购买了道具和服装 我购买了道具和服装。"; + String result[] = { "我", "购买", "了", "道具", "和", "服装", "我", "购买", "了", "道具", "和", "服装" }; + assertAnalyzesTo(ca, sentence, result); + } + + /* * Punctuation is handled in a strange way if you disable stopwords * In this example the IDEOGRAPHIC FULL STOP is converted into a comma. * if you don't supply (true) to the constructor, or use a different stopwords list, Index: docs/benchmarks.html =================================================================== --- docs/benchmarks.html (revision 796589) +++ docs/benchmarks.html (working copy) @@ -127,6 +127,9 @@ Analyzers