Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java	(revision 831928)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java	(working copy)
@@ -25,7 +25,7 @@
 /**
  * Manages analysis data configuration for SmartChineseAnalyzer
  * <p>
- * SmartChineseAnalyzer has a built-in dictionary and stopword list out-of-box.
+ * SmartChineseAnalyzer has a built-in dictionary out-of-box.
  * </p>
  * <p><font color="#FF0000">
  * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/CharType.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/CharType.java	(revision 831928)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/CharType.java	(working copy)
@@ -1,70 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.cn.smart;
-
-/**
- * Internal SmartChineseAnalyzer character type constants.
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
- */
-public class CharType {
-
-  /**
-   * Punctuation Characters
-   */
-  public final static int DELIMITER = 0;
-
-  /**
-   * Letters
-   */
-  public final static int LETTER = 1;
-
-  /**
-   * Numeric Digits
-   */
-  public final static int DIGIT = 2;
-
-  /**
-   * Han Ideographs
-   */
-  public final static int HANZI = 3;
-
-  /**
-   * Characters that act as a space
-   */
-  public final static int SPACE_LIKE = 4;
-
-  /**
-   * Full-Width letters
-   */
-  public final static int FULLWIDTH_LETTER = 5;
-
-  /**
-   * Full-Width alphanumeric characters
-   */
-  public final static int FULLWIDTH_DIGIT = 6;
-
-  /**
-   * Other (not fitting any of the other categories)
-   */
-  public final static int OTHER = 7;
-
-}
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/FullWidthFilter.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/FullWidthFilter.java	(revision 0)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/FullWidthFilter.java	(revision 0)
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * A {@link TokenFilter} that normalizes fullwidth forms into half-width forms.
+ * <p>
+ * Each character of a term in the range U+FF01..U+FF5E is replaced, in place,
+ * by the character 0xFEE0 code points lower (U+0021..U+007E). All other
+ * characters, and all other token attributes, are left untouched.
+ * </p>
+ */
+public final class FullWidthFilter extends TokenFilter {
+
+  /** First fullwidth form handled by this filter (U+FF01). */
+  private static final char FULLWIDTH_START = 0xFF01;
+
+  /** Last fullwidth form handled by this filter (U+FF5E). */
+  private static final char FULLWIDTH_END = 0xFF5E;
+
+  /** Distance between a fullwidth form and its half-width counterpart. */
+  private static final int FULLWIDTH_OFFSET = 0xFEE0;
+
+  private final TermAttribute termAtt;
+
+  /**
+   * Creates a filter that normalizes the terms produced by <code>input</code>.
+   *
+   * @param input the {@link TokenStream} to filter
+   */
+  public FullWidthFilter(TokenStream input) {
+    super(input);
+    termAtt = addAttribute(TermAttribute.class);
+  }
+
+  /**
+   * Advances to the next token of the wrapped stream and rewrites its term
+   * buffer in place.
+   *
+   * @return false if the wrapped stream is exhausted, true otherwise
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    final char[] text = termAtt.termBuffer();
+    final int length = termAtt.termLength();
+    for (int i = 0; i < length; i++) {
+      if (text[i] >= FULLWIDTH_START && text[i] <= FULLWIDTH_END) {
+        text[i] -= FULLWIDTH_OFFSET;
+      }
+    }
+    return true;
+  }
+}

Property changes on: contrib\analyzers\smartcn\src\java\org\apache\lucene\analysis\cn\smart\FullWidthFilter.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java	(revision 831928)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java	(working copy)
@@ -39,11 +39,6 @@
  */
 public final class SentenceTokenizer extends Tokenizer {
 
-  /**
-   * End of sentence punctuation: 。，！？；,!?;
-   */
-  private final static String PUNCTION = "。，！？；,!?;";
-
   private final StringBuilder buffer = new StringBuilder();
 
   private int tokenStart = 0, tokenEnd = 0;
@@ -86,12 +81,12 @@
     while (true) {
       if (ci == -1) {
         break;
-      } else if (PUNCTION.indexOf(ch) != -1) {
+      } else if (isChinesePunctuation(ch)) {
         // End of a sentence
         buffer.append(ch);
         tokenEnd++;
         break;
-      } else if (atBegin && Utility.SPACES.indexOf(ch) != -1) {
+      } else if (atBegin && isChineseSpace(ch)) {
         tokenStart++;
         tokenEnd++;
         ci = input.read();
@@ -104,8 +99,7 @@
         ci = input.read();
         ch = (char) ci;
         // Two spaces, such as CR, LF
-        if (Utility.SPACES.indexOf(ch) != -1
-            && Utility.SPACES.indexOf(pch) != -1) {
+        if (isChineseSpace(ch) && isChineseSpace(pch)) {
           // buffer.append(ch);
           tokenEnd++;
           break;
@@ -115,7 +109,13 @@
     if (buffer.length() == 0)
       return false;
     else {
-      termAtt.setTermBuffer(buffer.toString());
+      final int length = buffer.length();
+      char termBuffer[] = termAtt.termBuffer();
+      if (termBuffer.length < length)
+        termBuffer = termAtt.resizeTermBuffer(length);
+      
+      buffer.getChars(0, length, termBuffer, 0);
+      termAtt.setTermLength(length);
       offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenEnd));
       typeAtt.setType("sentence");
       return true;
@@ -131,4 +131,47 @@
     super.reset(input);
     reset();
   }
+
+  /**
+   * Tests whether <code>ch</code> ends a sentence.
+   *
+   * @param ch character to test
+   * @return true if <code>ch</code> is any one of 。，！？；,!?;
+   */
+  private static boolean isChinesePunctuation(int ch) {
+    switch (ch) {
+      case '。':
+      case '，':
+      case '！':
+      case '？':
+      case '；':
+      case ',':
+      case '!':
+      case '?':
+      case ';': // ASCII semicolon, in addition to the fullwidth one above
+        return true;
+      default:
+        return false;
+    }
+  }
+
+  /**
+   * Tests whether <code>ch</code> is treated as whitespace.
+   *
+   * @param ch character to test
+   * @return true if <code>ch</code> is space, ideographic space, tab,
+   *         carriage return, or newline
+   */
+  private static boolean isChineseSpace(int ch) {
+    switch (ch) {
+      case ' ':
+      case '　':
+      case '\t':
+      case '\r':
+      case '\n':
+        return true;
+      default:
+        return false;
+    }
+  }
 }
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java	(revision 831928)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java	(working copy)
@@ -18,20 +18,16 @@
 package org.apache.lucene.analysis.cn.smart;
 
 import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
 import java.io.Reader;
 import java.util.Collections;
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.PorterStemFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.WordlistLoader;
-import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
-import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
 import org.apache.lucene.util.Version;
 
 /**
@@ -61,85 +57,23 @@
 public class SmartChineseAnalyzer extends Analyzer {
 
   private final Set<?> stopWords;
-  
-  private static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
-  
-  private static final String STOPWORD_FILE_COMMENT = "//";
-  
-  /**
-   * Returns an unmodifiable instance of the default stop-words set.
-   * @return an unmodifiable instance of the default stop-words set.
-   */
-  public static Set<String> getDefaultStopSet(){
-    return DefaultSetHolder.DEFAULT_STOP_SET;
-  }
-  
-  /**
-   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class 
-   * accesses the static final set the first time.;
-   */
-  private static class DefaultSetHolder {
-    static final Set<String> DEFAULT_STOP_SET;
-
-    static {
-      try {
-        DEFAULT_STOP_SET = loadDefaultStopWordSet();
-      } catch (IOException ex) {
-        // default set should always be present as it is part of the
-        // distribution (JAR)
-        throw new RuntimeException("Unable to load default stopword set");
-      }
-    }
-
-    static Set<String> loadDefaultStopWordSet() throws IOException {
-      InputStream stream = SmartChineseAnalyzer.class
-          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
-      try {
-        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
-        // make sure it is unmodifiable as we expose it in the outer class
-        return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader, STOPWORD_FILE_COMMENT));
-      } finally {
-        stream.close();
-      }
-    }
-  }
-
   private final Version matchVersion;
 
   /**
-   * Create a new SmartChineseAnalyzer, using the default stopword list.
+   * Create a new SmartChineseAnalyzer, with no stopwords.
    */
   public SmartChineseAnalyzer(Version matchVersion) {
-    this(matchVersion, true);
+    this(matchVersion, null);
   }
 
   /**
    * <p>
-   * Create a new SmartChineseAnalyzer, optionally using the default stopword list.
-   * </p>
-   * <p>
-   * The included default stopword list is simply a list of punctuation.
-   * If you do not use this list, punctuation will not be removed from the text!
-   * </p>
-   * 
-   * @param useDefaultStopWords true to use the default stopword list.
-   */
-  public SmartChineseAnalyzer(Version matchVersion, boolean useDefaultStopWords) {
-    stopWords = useDefaultStopWords ? DefaultSetHolder.DEFAULT_STOP_SET
-      : Collections.EMPTY_SET;
-    this.matchVersion = matchVersion;
-  }
-
-  /**
-   * <p>
    * Create a new SmartChineseAnalyzer, using the provided {@link Set} of stopwords.
    * </p>
-   * <p>
-   * Note: the set should include punctuation, unless you want to index punctuation!
-   * </p>
    * @param stopWords {@link Set} of stopwords to use.
    */
-  public SmartChineseAnalyzer(Version matchVersion, Set stopWords) {
+  public SmartChineseAnalyzer(Version matchVersion, Set<?> stopWords) {
+    super();
     this.stopWords = stopWords==null?Collections.EMPTY_SET:stopWords;
     this.matchVersion = matchVersion;
   }
@@ -147,9 +81,8 @@
   public TokenStream tokenStream(String fieldName, Reader reader) {
     TokenStream result = new SentenceTokenizer(reader);
     result = new WordTokenFilter(result);
-    // result = new LowerCaseFilter(result);
-    // LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text.
-    // The porter stemming is too strict, this is not a bug, this is a feature:)
+    result = new FullWidthFilter(result);
+    result = new LowerCaseFilter(result);
     result = new PorterStemFilter(result);
     if (!stopWords.isEmpty()) {
       result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
@@ -171,6 +104,8 @@
       setPreviousTokenStream(streams);
       streams.tokenStream = new SentenceTokenizer(reader);
       streams.filteredTokenStream = new WordTokenFilter(streams.tokenStream);
+      streams.filteredTokenStream = new FullWidthFilter(streams.filteredTokenStream);
+      streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
       streams.filteredTokenStream = new PorterStemFilter(streams.filteredTokenStream);
       if (!stopWords.isEmpty()) {
         streams.filteredTokenStream = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java	(revision 831928)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java	(working copy)
@@ -1,184 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.cn.smart;
-
-import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter; // for javadoc
-
-/**
- * SmartChineseAnalyzer utility constants and methods
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
- */
-public class Utility {
-
-  public static final char[] STRING_CHAR_ARRAY = new String("未##串")
-      .toCharArray();
-
-  public static final char[] NUMBER_CHAR_ARRAY = new String("未##数")
-      .toCharArray();
-
-  public static final char[] START_CHAR_ARRAY = new String("始##始")
-      .toCharArray();
-
-  public static final char[] END_CHAR_ARRAY = new String("末##末").toCharArray();
-
-  /**
-   * Delimiters will be filtered to this character by {@link SegTokenFilter}
-   */
-  public static final char[] COMMON_DELIMITER = new char[] { ',' };
-
-  /**
-   * Space-like characters that need to be skipped: such as space, tab, newline, carriage return.
-   */
-  public static final String SPACES = " 　\t\r\n";
-
-  /**
-   * Maximum bigram frequency (used in the smoothing function). 
-   */
-  public static final int MAX_FREQUENCE = 2079997 + 80000;
-
-  /**
-   * compare two arrays starting at the specified offsets.
-   * 
-   * @param larray left array
-   * @param lstartIndex start offset into larray
-   * @param rarray right array
-   * @param rstartIndex start offset into rarray
-   * @return 0 if the arrays are equal，1 if larray > rarray, -1 if larray < rarray
-   */
-  public static int compareArray(char[] larray, int lstartIndex, char[] rarray,
-      int rstartIndex) {
-
-    if (larray == null) {
-      if (rarray == null || rstartIndex >= rarray.length)
-        return 0;
-      else
-        return -1;
-    } else {
-      // larray != null
-      if (rarray == null) {
-        if (lstartIndex >= larray.length)
-          return 0;
-        else
-          return 1;
-      }
-    }
-
-    int li = lstartIndex, ri = rstartIndex;
-    while (li < larray.length && ri < rarray.length && larray[li] == rarray[ri]) {
-      li++;
-      ri++;
-    }
-    if (li == larray.length) {
-      if (ri == rarray.length) {
-        // Both arrays are equivalent, return 0.
-        return 0;
-      } else {
-        // larray < rarray because larray has ended first.
-        return -1;
-      }
-    } else {
-      // differing lengths
-      if (ri == rarray.length) {
-        // larray > rarray because rarray has ended first.
-        return 1;
-      } else {
-        // determine by comparison
-        if (larray[li] > rarray[ri])
-          return 1;
-        else
-          return -1;
-      }
-    }
-  }
-
-  /**
-   * Compare two arrays, starting at the specified offsets, but treating shortArray as a prefix to longArray.
-   * As long as shortArray is a prefix of longArray, return 0.
-   * Otherwise, behave as {@link Utility#compareArray(char[], int, char[], int)}
-   * 
-   * @param shortArray prefix array
-   * @param shortIndex offset into shortArray
-   * @param longArray long array (word)
-   * @param longIndex offset into longArray
-   * @return 0 if shortArray is a prefix of longArray, otherwise act as {@link Utility#compareArray(char[], int, char[], int)}
-   */
-  public static int compareArrayByPrefix(char[] shortArray, int shortIndex,
-      char[] longArray, int longIndex) {
-
-    // a null prefix is a prefix of longArray
-    if (shortArray == null)
-      return 0;
-    else if (longArray == null)
-      return (shortIndex < shortArray.length) ? 1 : 0;
-
-    int si = shortIndex, li = longIndex;
-    while (si < shortArray.length && li < longArray.length
-        && shortArray[si] == longArray[li]) {
-      si++;
-      li++;
-    }
-    if (si == shortArray.length) {
-      // shortArray is a prefix of longArray
-      return 0;
-    } else {
-      // shortArray > longArray because longArray ended first.
-      if (li == longArray.length)
-        return 1;
-      else
-        // determine by comparison
-        return (shortArray[si] > longArray[li]) ? 1 : -1;
-    }
-  }
-
-  /**
-   * Return the internal {@link CharType} constant of a given character. 
-   * @param ch input character
-   * @return constant from {@link CharType} describing the character type.
-   * 
-   * @see CharType
-   */
-  public static int getCharType(char ch) {
-    // Most (but not all!) of these are Han Ideographic Characters
-    if (ch >= 0x4E00 && ch <= 0x9FA5)
-      return CharType.HANZI;
-    if ((ch >= 0x0041 && ch <= 0x005A) || (ch >= 0x0061 && ch <= 0x007A))
-      return CharType.LETTER;
-    if (ch >= 0x0030 && ch <= 0x0039)
-      return CharType.DIGIT;
-    if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n' || ch == '　')
-      return CharType.SPACE_LIKE;
-    // Punctuation Marks
-    if ((ch >= 0x0021 && ch <= 0x00BB) || (ch >= 0x2010 && ch <= 0x2642)
-        || (ch >= 0x3001 && ch <= 0x301E))
-      return CharType.DELIMITER;
-
-    // Full-Width range
-    if ((ch >= 0xFF21 && ch <= 0xFF3A) || (ch >= 0xFF41 && ch <= 0xFF5A))
-      return CharType.FULLWIDTH_LETTER;
-    if (ch >= 0xFF10 && ch <= 0xFF19)
-      return CharType.FULLWIDTH_DIGIT;
-    if (ch >= 0xFE30 && ch <= 0xFF63)
-      return CharType.DELIMITER;
-    return CharType.OTHER;
-
-  }
-}
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java	(revision 831928)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java	(working copy)
@@ -1,93 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.cn.smart;
-
-import java.util.Collections;
-import java.util.List;
-
-import org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter;
-import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;
-import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter;
-
-/**
- * Segment a sentence of Chinese text into words.
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
- */
-class WordSegmenter {
-
-  private HHMMSegmenter hhmmSegmenter = new HHMMSegmenter();
-
-  private SegTokenFilter tokenFilter = new SegTokenFilter();
-
-  /**
-   * Segment a sentence into words with {@link HHMMSegmenter}
-   * 
-   * @param sentence input sentence
-   * @param startOffset start offset of sentence
-   * @return {@link List} of {@link SegToken}
-   */
-  public List<SegToken> segmentSentence(String sentence, int startOffset) {
-
-    List<SegToken> segTokenList = hhmmSegmenter.process(sentence);
-    // tokens from sentence, excluding WordType.SENTENCE_BEGIN and WordType.SENTENCE_END
-    List<SegToken> result = Collections.emptyList();
-    
-    if (segTokenList.size() > 2) // if its not an empty sentence
-      result = segTokenList.subList(1, segTokenList.size() - 1);
-    
-    for (SegToken st : result)
-      convertSegToken(st, sentence, startOffset);
-    
-    return result;
-  }
-
-  /**
-   * Process a {@link SegToken} so that it is ready for indexing.
-   * 
-   * This method calculates offsets and normalizes the token with {@link SegTokenFilter}.
-   * 
-   * @param st input {@link SegToken}
-   * @param sentence associated Sentence
-   * @param sentenceStartOffset offset into sentence
-   * @return Lucene {@link SegToken}
-   */
-  public SegToken convertSegToken(SegToken st, String sentence,
-      int sentenceStartOffset) {
-
-    switch (st.wordType) {
-      case WordType.STRING:
-      case WordType.NUMBER:
-      case WordType.FULLWIDTH_NUMBER:
-      case WordType.FULLWIDTH_STRING:
-        st.charArray = sentence.substring(st.startOffset, st.endOffset)
-            .toCharArray();
-        break;
-      default:
-        break;
-    }
-
-    st = tokenFilter.filter(st);
-    st.startOffset += sentenceStartOffset;
-    st.endOffset += sentenceStartOffset;
-    return st;
-  }
-}
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java	(revision 831958)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java	(working copy)
@@ -23,6 +23,7 @@
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter;
 import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@@ -38,10 +39,11 @@
  */
 public final class WordTokenFilter extends TokenFilter {
 
-  private WordSegmenter wordSegmenter;
+  private HHMMSegmenter wordSegmenter;
 
   private Iterator<SegToken> tokenIter;
-
+  private String sentence;
+  private int sentenceOffset;
   private List<SegToken> tokenBuffer;
   
   private TermAttribute termAtt;
@@ -55,7 +57,7 @@
    */
   public WordTokenFilter(TokenStream in) {
     super(in);
-    this.wordSegmenter = new WordSegmenter();
+    this.wordSegmenter = new HHMMSegmenter();
     termAtt = addAttribute(TermAttribute.class);
     offsetAtt = addAttribute(OffsetAttribute.class);
     typeAtt = addAttribute(TypeAttribute.class);
@@ -64,26 +66,31 @@
   public boolean incrementToken() throws IOException {   
     if (tokenIter == null || !tokenIter.hasNext()) {
       // there are no remaining tokens from the current sentence... are there more sentences?
-      if (input.incrementToken()) {
-        // a new sentence is available: process it.
-        tokenBuffer = wordSegmenter.segmentSentence(termAtt.term(), offsetAtt.startOffset());
-        tokenIter = tokenBuffer.iterator();
-        /* 
-         * it should not be possible to have a sentence with 0 words, check just in case.
-         * returning EOS isn't the best either, but its the behavior of the original code.
-         */
-        if (!tokenIter.hasNext())
+      while(true) {
+        if (input.incrementToken()) {
+          sentence = termAtt.term();
+          sentenceOffset = offsetAtt.startOffset();
+          tokenBuffer = wordSegmenter.process(sentence);
+          tokenIter = tokenBuffer.iterator();
+          if (tokenIter.hasNext()) // could be an empty sentence!
+            break;
+        } else {
           return false;
-      } else {
-        return false; // no more sentences, end of stream!
+        }
       }
     } 
     // WordTokenFilter must clear attributes, as it is creating new tokens.
     clearAttributes();
     // There are remaining tokens from the current sentence, return the next one. 
     SegToken nextWord = tokenIter.next();
-    termAtt.setTermBuffer(nextWord.charArray, 0, nextWord.charArray.length);
-    offsetAtt.setOffset(nextWord.startOffset, nextWord.endOffset);
+    char termBuffer[] = termAtt.termBuffer();
+    int length = nextWord.endOffset - nextWord.startOffset;
+    // the buffer needs growing only when its capacity (not the current term length) is too small
+    if (termBuffer.length < length)
+      termBuffer = termAtt.resizeTermBuffer(length);
+    sentence.getChars(nextWord.startOffset, nextWord.endOffset, termBuffer, 0);
+    termAtt.setTermLength(length);
+    offsetAtt.setOffset(sentenceOffset + nextWord.startOffset, sentenceOffset + nextWord.endOffset);
     typeAtt.setType("word");
     return true;
   }
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordType.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordType.java	(revision 831928)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordType.java	(working copy)
@@ -1,70 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.cn.smart;
-
-/**
- * Internal SmartChineseAnalyzer token type constants
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
- */
-public class WordType {
-
-  /**
-   * Start of a Sentence
-   */
-  public final static int SENTENCE_BEGIN = 0;
-
-  /**
-   * End of a Sentence
-   */
-  public final static int SENTENCE_END = 1;
-
-  /**
-   * Chinese Word 
-   */
-  public final static int CHINESE_WORD = 2;
-
-  /**
-   * ASCII String
-   */
-  public final static int STRING = 3;
-
-  /**
-   * ASCII Alphanumeric 
-   */
-  public final static int NUMBER = 4;
-
-  /**
-   * Punctuation Symbol
-   */
-  public final static int DELIMITER = 5;
-
-  /**
-   * Full-Width String
-   */
-  public final static int FULLWIDTH_STRING = 6;
-
-  /**
-   * Full-Width Alphanumeric
-   */
-  public final static int FULLWIDTH_NUMBER = 7;
-
-}
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java	(revision 831928)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java	(working copy)
@@ -18,12 +18,11 @@
 package org.apache.lucene.analysis.cn.smart.hhmm;
 
 import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
-import java.util.Map;
 
-import org.apache.lucene.analysis.cn.smart.Utility;
+import org.apache.lucene.util.ArrayUtil;
 
 /**
  * Graph representing possible token pairs (bigrams) at each start offset in the sentence.
@@ -38,15 +37,22 @@
  */
 class BiSegGraph {
 
-  private Map<Integer,ArrayList<SegTokenPair>> tokenPairListTable = new HashMap<Integer,ArrayList<SegTokenPair>>();
-
+  // TODO: there is probably a more efficient representation than this table;
+  // for now it is a temporary hack.
+  private double[][] table;
+  private int min[]; // minimum viterbi search bounds
+  private int max[]; // maximum viterbi search bounds
+  
+  private int path[];
+  private double pathWeight[];
+  
+  int tableSize;
   private List<SegToken> segTokenList;
 
   private static BigramDictionary bigramDict = BigramDictionary.getInstance();
 
   public BiSegGraph(SegGraph segGraph) {
-    segTokenList = segGraph.makeIndex();
-    generateBiSegGraph(segGraph);
+    reset(segGraph);
   }
 
   /*
@@ -59,19 +65,17 @@
     double oneWordFreq, weight, tinyDouble = 1.0 / Utility.MAX_FREQUENCE;
 
     int next;
-    char[] idBuffer;
-    // get the list of tokens ordered and indexed
-    segTokenList = segGraph.makeIndex();
+
     // Because the beginning position of startToken is -1, therefore startToken can be obtained when key = -1
     int key = -1;
     List<SegToken> nextTokens = null;
     while (key < maxStart) {
-      if (segGraph.isStartExist(key)) {
+      List<SegToken> tokenList = segGraph.getStartList(key);
+      if (!tokenList.isEmpty()) {
 
-        List<SegToken> tokenList = segGraph.getStartList(key);
-
         // Calculate all tokens for a given key.
-        for (SegToken t1 : tokenList) {
+        for (int i = 0; i < tokenList.size(); i++) {
+          SegToken t1 = tokenList.get(i);
           oneWordFreq = t1.weight;
           next = t1.endOffset;
           nextTokens = null;
@@ -80,24 +84,19 @@
           // If we cannot find the next Token, then go to the end and repeat the same cycle.
           while (next <= maxStart) {
             // Because the beginning position of endToken is sentenceLen, so equal to sentenceLen can find endToken.
-            if (segGraph.isStartExist(next)) {
-              nextTokens = segGraph.getStartList(next);
+            nextTokens = segGraph.getStartList(next);
+            if (!nextTokens.isEmpty())
               break;
-            }
             next++;
           }
-          if (nextTokens == null) {
+          if (nextTokens == null || nextTokens.isEmpty()) {
             break;
           }
-          for (SegToken t2 : nextTokens) {
-            idBuffer = new char[t1.charArray.length + t2.charArray.length + 1];
-            System.arraycopy(t1.charArray, 0, idBuffer, 0, t1.charArray.length);
-            idBuffer[t1.charArray.length] = BigramDictionary.WORD_SEGMENT_CHAR;
-            System.arraycopy(t2.charArray, 0, idBuffer,
-                t1.charArray.length + 1, t2.charArray.length);
+          for (int j = 0; j < nextTokens.size(); j++) {
+            SegToken t2 = nextTokens.get(j);
 
             // Two linked Words frequency
-            wordPairFreq = bigramDict.getFrequency(idBuffer);
+            wordPairFreq = bigramDict.getFrequency(t1.charArray, t2.charArray);
 
             // Smoothing
 
@@ -109,9 +108,10 @@
                     + (1.0 - smooth)
                     * ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble));
 
-            SegTokenPair tokenPair = new SegTokenPair(idBuffer, t1.index,
-                t2.index, weight);
-            this.addSegTokenPair(tokenPair);
+            // table[to][from] = weight
+            table[t2.index][t1.index] = weight;
+            min[t2.index] = Math.min(min[t2.index], t1.index);
+            max[t2.index] = Math.max(max[t2.index], t1.index);          
           }
         }
       }
@@ -121,115 +121,79 @@
   }
 
   /**
-   * Returns true if their is a list of token pairs at this offset (index of the second token)
-   * 
-   * @param to index of the second token in the token pair
-   * @return true if a token pair exists
-   */
-  public boolean isToExist(int to) {
-    return tokenPairListTable.get(Integer.valueOf(to)) != null;
-  }
-
-  /**
-   * Return a {@link List} of all token pairs at this offset (index of the second token)
-   * 
-   * @param to index of the second token in the token pair
-   * @return {@link List} of token pairs.
-   */
-  public List<SegTokenPair> getToList(int to) {
-    return tokenPairListTable.get(to);
-  }
-
-  /**
-   * Add a {@link SegTokenPair}
-   * 
-   * @param tokenPair {@link SegTokenPair}
-   */
-  public void addSegTokenPair(SegTokenPair tokenPair) {
-    int to = tokenPair.to;
-    if (!isToExist(to)) {
-      ArrayList<SegTokenPair> newlist = new ArrayList<SegTokenPair>();
-      newlist.add(tokenPair);
-      tokenPairListTable.put(to, newlist);
-    } else {
-      List<SegTokenPair> tokenPairList = tokenPairListTable.get(to);
-      tokenPairList.add(tokenPair);
-    }
-  }
-
-  /**
-   * Get the number of {@link SegTokenPair} entries in the table.
-   * @return number of {@link SegTokenPair} entries
-   */
-  public int getToCount() {
-    return tokenPairListTable.size();
-  }
-
-  /**
    * Find the shortest path with the Viterbi algorithm.
    * @return {@link List}
    */
   public List<SegToken> getShortPath() {
     int current;
-    int nodeCount = getToCount();
-    List<PathNode> path = new ArrayList<PathNode>();
-    PathNode zeroPath = new PathNode();
-    zeroPath.weight = 0;
-    zeroPath.preNode = 0;
-    path.add(zeroPath);
+    int nodeCount = tableSize - 1;
+
     for (current = 1; current <= nodeCount; current++) {
       double weight;
-      List<SegTokenPair> edges = getToList(current);
+      double edges[] = table[current];
 
       double minWeight = Double.MAX_VALUE;
-      SegTokenPair minEdge = null;
-      for (SegTokenPair edge : edges) {
-        weight = edge.weight;
-        PathNode preNode = path.get(edge.from);
-        if (preNode.weight + weight < minWeight) {
-          minWeight = preNode.weight + weight;
-          minEdge = edge;
+      int minEdge = 0;
+      for (int i = min[current]; i <= max[current]; i++) {
+        weight = edges[i];
+        if (weight != 0) {
+          double preNodeWeight = pathWeight[i];
+          if (preNodeWeight + weight < minWeight) {
+            minWeight = preNodeWeight + weight;
+            minEdge = i;
+          }
         }
       }
-      PathNode newNode = new PathNode();
-      newNode.weight = minWeight;
-      newNode.preNode = minEdge.from;
-      path.add(newNode);
+      path[current] = minEdge;
+      pathWeight[current] = minWeight;
     }
 
-    // Calculate PathNodes
-    int preNode, lastNode;
-    lastNode = path.size() - 1;
-    current = lastNode;
-    List<Integer> rpath = new ArrayList<Integer>();
     List<SegToken> resultPath = new ArrayList<SegToken>();
-
-    rpath.add(current);
+    current = current - 1;
+    
+    // the walk starts at the last node (SENTENCE_END); only predecessors are added, so it is never emitted.
     while (current != 0) {
-      PathNode currentPathNode = (PathNode) path.get(current);
-      preNode = currentPathNode.preNode;
-      rpath.add(Integer.valueOf(preNode));
+      int preNode = path[current];
+      SegToken st = segTokenList.get(preNode);
+      switch(st.wordType) {
+        case WordType.SENTENCE_BEGIN:
+        case WordType.SENTENCE_END:
+        case WordType.DELIMITER:
+          break;
+        default:
+          resultPath.add(st);
+      }
       current = preNode;
     }
-    for (int j = rpath.size() - 1; j >= 0; j--) {
-      Integer idInteger = (Integer) rpath.get(j);
-      int id = idInteger.intValue();
-      SegToken t = segTokenList.get(id);
-      resultPath.add(t);
-    }
+    
+    Collections.reverse(resultPath);
     return resultPath;
-
   }
-
-  public String toString() {
-    StringBuilder sb = new StringBuilder();
-    Collection<ArrayList<SegTokenPair>>  values = tokenPairListTable.values();
-    for (ArrayList<SegTokenPair> segList : values) {
-      for (SegTokenPair pair : segList) {
-        sb.append(pair + "\n");
-      }
+  
+  void reset(SegGraph segGraph) {
+    // get the list of tokens ordered and indexed
+    segTokenList = segGraph.makeIndex();
+    SegToken lastToken = segTokenList.get(segTokenList.size() - 1);
+    tableSize = lastToken.index + 1;
+    if (table == null || tableSize > table.length) {
+      int newSize = ArrayUtil.getNextSize(tableSize);
+      table = new double[newSize][];
+      for (int i = 0; i < newSize; i++)
+        table[i] = new double[newSize];
+      min = new int[newSize];
+      Arrays.fill(min, 0, tableSize, Integer.MAX_VALUE);
+      max = new int[newSize];
+      path = new int[newSize];
+      pathWeight = new double[newSize];
+    } else {
+      for (int i = 0; i < tableSize; i++)
+        Arrays.fill(table[i], 0, tableSize, 0);
+      Arrays.fill(min, 0, tableSize, Integer.MAX_VALUE);
+      Arrays.fill(max, 0, tableSize, 0);
+      Arrays.fill(path, 0, tableSize, 0);
+      Arrays.fill(pathWeight, 0, tableSize, 0);
     }
-    return sb.toString();
+    generateBiSegGraph(segGraph);
   }
 
 }
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java	(revision 831928)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java	(working copy)
@@ -60,8 +60,6 @@
 
   private int max = 0;
 
-  private int repeat = 0;
-
   // static Logger log = Logger.getLogger(BigramDictionary.class);
 
   public synchronized static BigramDictionary getInstance() {
@@ -236,26 +234,79 @@
     } else
       return -1;
   }
+ 
+  /**
+   * 64-bit FNV-1a hash function (on bigrams); each char is fed low byte first, then high byte
+   * 
+   * @param left left side of bigram
+   * @param right right side of bigram
+   * @return hashcode
+   */
+  private long hash1(char left[], char right[]) {
+    final long p = 1099511628211L;
+    long hash = 0xcbf29ce484222325L;
+    for (int i = 0; i < left.length; i++) {
+      char d = left[i];
+      hash = (hash ^ (d & 0x00FF)) * p;
+      hash = (hash ^ (d >> 8)) * p;
+    }
+    
+    hash = (hash ^ (BigramDictionary.WORD_SEGMENT_CHAR & 0x00FF)) * p;
+    hash = (hash ^ (BigramDictionary.WORD_SEGMENT_CHAR >> 8)) * p;
+    
+    for (int i = 0; i < right.length; i++) {
+      char d = right[i];
+      hash = (hash ^ (d & 0x00FF)) * p;
+      hash = (hash ^ (d >> 8)) * p;
+    }
+    return hash;
+  }
+  
+  /**
+   * djb2 hash algorithm (k=33) (on bigrams)
+   * 
+   * @param left left side of bigram
+   * @param right right side of bigram
+   * @return hashcode
+   */
+  private int hash2(char left[], char right[]) {
+    int hash = 5381;
 
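+    /* hash * 33 + c. NOTE: '+' binds tighter than '&' and '>>', so the mask/shift apply to the whole sum, not just d; presumably must stay as-is to match the existing table -- confirm before changing */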
+    for (int i = 0; i < left.length; i++) {
+      char d = left[i];
+      hash = ((hash << 5) + hash) + d & 0x00FF;
+      hash = ((hash << 5) + hash) + d >> 8;
+    }
+    
+    hash = ((hash << 5) + hash) + BigramDictionary.WORD_SEGMENT_CHAR & 0x00FF;
+    hash = ((hash << 5) + hash) + BigramDictionary.WORD_SEGMENT_CHAR >> 8;
+    
+    for (int i = 0; i < right.length; i++) {
+      char d = right[i];
+      hash = ((hash << 5) + hash) + d & 0x00FF;
+      hash = ((hash << 5) + hash) + d >> 8;
+    }
+    return hash;
+  }
+  
   /*
-   * lookup the index into the frequency array.
+   * lookup the index into the frequency array for a bigram
    */
-  private int getBigramItemIndex(char carray[]) {
-    long hashId = hash1(carray);
+  private int getBigramItemIndex(char left[], char right[]) {
+    long hashId = hash1(left, right);
     int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH);
-    int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH;
+    int hash2 = hash2(left, right) % PRIME_BIGRAM_LENGTH;
     if (hash1 < 0)
       hash1 = PRIME_BIGRAM_LENGTH + hash1;
     if (hash2 < 0)
       hash2 = PRIME_BIGRAM_LENGTH + hash2;
     int index = hash1;
     int i = 1;
-    repeat++;
     while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
         && i < PRIME_BIGRAM_LENGTH) {
       index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
       i++;
-      repeat++;
       if (i > max)
         max = i;
     }
@@ -266,12 +317,14 @@
     } else
       return -1;
   }
-
-  public int getFrequency(char[] carray) {
-    int index = getBigramItemIndex(carray);
+  
+  /**
+   * get the frequency of a bigram.
+   */
+  final int getFrequency(char left[], char right[]) {
+    int index = getBigramItemIndex(left, right);
     if (index != -1)
       return frequencyTable[index];
     return 0;
   }
-
 }
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/CharType.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/CharType.java	(revision 831928)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/CharType.java	(working copy)
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.lucene.analysis.cn.smart;
+package org.apache.lucene.analysis.cn.smart.hhmm;
 
 /**
  * Internal SmartChineseAnalyzer character type constants.
@@ -25,7 +25,7 @@
  * supported anymore in such a case.</font>
  * </p>
  */
-public class CharType {
+class CharType {
 
   /**
    * Punctuation Characters
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java	(revision 831928)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java	(working copy)
@@ -19,9 +19,6 @@
 
 import java.util.List;
 
-import org.apache.lucene.analysis.cn.smart.CharType;
-import org.apache.lucene.analysis.cn.smart.Utility;
-import org.apache.lucene.analysis.cn.smart.WordType;
 import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;//javadoc @link
 
 /**
@@ -35,6 +32,21 @@
 public class HHMMSegmenter {
 
   private static WordDictionary wordDict = WordDictionary.getInstance();
+  
+  // static frequencies for constant marker tokens
+  private static final int STRING_FREQUENCY = 
+    wordDict.getFrequency(Utility.STRING_CHAR_ARRAY);
+  private static final int NUMBER_FREQUENCY =
+    wordDict.getFrequency(Utility.NUMBER_CHAR_ARRAY);
+  private static final int SENTENCE_START_FREQUENCY = 
+    wordDict.getFrequency(Utility.START_CHAR_ARRAY);
+  private static final int SENTENCE_END_FREQUENCY =
+    wordDict.getFrequency(Utility.END_CHAR_ARRAY);
+  
+  // reusable word graph
+  private SegGraph segGraph = new SegGraph(10);
+  // reusable bigram graph
+  private BiSegGraph biSegGraph;
 
   /**
    * Create the {@link SegGraph} for a sentence.
@@ -54,7 +66,7 @@
     int wordType;
     char[] charArray;
 
-    SegGraph segGraph = new SegGraph();
+    segGraph.reset(length);
     while (i < length) {
       hasFullWidth = false;
       switch (charTypeArray[i]) {
@@ -63,7 +75,7 @@
           break;
         case CharType.HANZI:
           j = i + 1;
-          wordBuf.delete(0, wordBuf.length());
+          wordBuf.setLength(0);
           // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not, 
           // it will store that single Chinese character (Hanzi) in the SegGraph.  Otherwise, it will 
           // cause word division.
@@ -115,7 +127,7 @@
           }
           // Found a Token from i to j. Type is LETTER char string.
           charArray = Utility.STRING_CHAR_ARRAY;
-          frequency = wordDict.getFrequency(charArray);
+          frequency = STRING_FREQUENCY;
           wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
           token = new SegToken(charArray, i, j, wordType, frequency);
           segGraph.addToken(token);
@@ -133,7 +145,7 @@
           }
           // Found a Token from i to j. Type is NUMBER char string.
           charArray = Utility.NUMBER_CHAR_ARRAY;
-          frequency = wordDict.getFrequency(charArray);
+          frequency = NUMBER_FREQUENCY;
           wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
           token = new SegToken(charArray, i, j, wordType, frequency);
           segGraph.addToken(token);
@@ -153,7 +165,7 @@
           // Treat the unrecognized char symbol as unknown string.
           // For example, any symbol not in GB2312 is treated as one of these.
           charArray = Utility.STRING_CHAR_ARRAY;
-          frequency = wordDict.getFrequency(charArray);
+          frequency = STRING_FREQUENCY;
           token = new SegToken(charArray, i, j, WordType.STRING, frequency);
           segGraph.addToken(token);
           i = j;
@@ -163,13 +175,13 @@
 
     // Add two more Tokens: "beginning xx beginning"
     charArray = Utility.START_CHAR_ARRAY;
-    frequency = wordDict.getFrequency(charArray);
+    frequency = SENTENCE_START_FREQUENCY;
     token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency);
     segGraph.addToken(token);
 
     // "end xx end"
     charArray = Utility.END_CHAR_ARRAY;
-    frequency = wordDict.getFrequency(charArray);
+    frequency = SENTENCE_END_FREQUENCY;
     token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END,
         frequency);
     segGraph.addToken(token);
@@ -202,7 +214,10 @@
    */
   public List<SegToken> process(String sentence) {
     SegGraph segGraph = createSegGraph(sentence);
-    BiSegGraph biSegGraph = new BiSegGraph(segGraph);
+    if (biSegGraph == null)
+      biSegGraph = new BiSegGraph(segGraph);
+    else
+      biSegGraph.reset(segGraph);
     List<SegToken> shortPath = biSegGraph.getShortPath();
     return shortPath;
   }
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java	(revision 831928)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java	(working copy)
@@ -1,76 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.cn.smart.hhmm;
-
-/**
- * SmartChineseAnalyzer internal node representation
- * <p>
- * Used by {@link BiSegGraph} to maximize the segmentation with the Viterbi algorithm.
- * </p>
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
- */
-class PathNode implements Comparable<PathNode> {
-  public double weight;
-
-  public int preNode;
-
-  public int compareTo(PathNode pn) {
-    if (weight < pn.weight)
-      return -1;
-    else if (weight == pn.weight)
-      return 0;
-    else
-      return 1;
-  }
-
-  /**
-   * @see java.lang.Object#hashCode()
-   */
-  public int hashCode() {
-    final int prime = 31;
-    int result = 1;
-    result = prime * result + preNode;
-    long temp;
-    temp = Double.doubleToLongBits(weight);
-    result = prime * result + (int) (temp ^ (temp >>> 32));
-    return result;
-  }
-
-  /**
-   * @see java.lang.Object#equals(java.lang.Object)
-   */
-  public boolean equals(Object obj) {
-    if (this == obj)
-      return true;
-    if (obj == null)
-      return false;
-    if (getClass() != obj.getClass())
-      return false;
-    PathNode other = (PathNode) obj;
-    if (preNode != other.preNode)
-      return false;
-    if (Double.doubleToLongBits(weight) != Double
-        .doubleToLongBits(other.weight))
-      return false;
-    return true;
-  }
-}
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java	(revision 831928)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java	(working copy)
@@ -18,9 +18,7 @@
 package org.apache.lucene.analysis.cn.smart.hhmm;
 
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.List;
-import java.util.Map;
 
 /**
  * Graph representing possible tokens at each start offset in the sentence.
@@ -37,19 +35,17 @@
 
   /**
    * Map of start offsets to ArrayList of tokens at that position
+   * Note: offset i is stored as array index i + 1.
+   * This is because the sentence start token is stored at position -1
    */
-  private Map<Integer,ArrayList<SegToken>> tokenListTable = new HashMap<Integer,ArrayList<SegToken>>();
+  private final TokenGraph<SegToken> table;
 
-  private int maxStart = -1;
-
   /**
-   * Returns true if a mapping for the specified start offset exists
-   * 
-   * @param s startOffset
-   * @return true if there are tokens for the startOffset
+   * Create a new SegGraph capable of representing a sentence of maxSize.
    */
-  public boolean isStartExist(int s) {
-    return tokenListTable.get(s) != null;
+  SegGraph(int maxSize) {
+    // sentence size, plus sentence start marker, plus sentence end marker
+    table = new TokenGraph<SegToken>(maxSize + 2);
   }
 
   /**
@@ -59,7 +55,7 @@
    * @return List of tokens at the specified start offset.
    */
   public List<SegToken> getStartList(int s) {
-    return tokenListTable.get(s);
+    return table.get(s + 1);
   }
 
   /**
@@ -68,7 +64,7 @@
    * @return maximum start offset, or -1 if the map is empty.
    */
   public int getMaxStart() {
-    return maxStart;
+    return table.getMax() - 1;
   }
 
   /**
@@ -77,13 +73,14 @@
    */
   public List<SegToken> makeIndex() {
     List<SegToken> result = new ArrayList<SegToken>();
-    int s = -1, count = 0, size = tokenListTable.size();
+    int s = -1, count = 0, size = table.size();
     List<SegToken> tokenList;
     short index = 0;
     while (count < size) {
-      if (isStartExist(s)) {
-        tokenList = tokenListTable.get(s);
-        for (SegToken st : tokenList) {
+      tokenList = getStartList(s);
+      if (!tokenList.isEmpty()) {
+        for (int i = 0; i < tokenList.size(); i++) {
+          SegToken st = tokenList.get(i);
           st.index = index;
           result.add(st);
           index++;
@@ -92,6 +89,7 @@
       }
       s++;
     }
+    
     return result;
   }
 
@@ -100,48 +98,19 @@
    * @param token {@link SegToken}
    */
   public void addToken(SegToken token) {
-    int s = token.startOffset;
-    if (!isStartExist(s)) {
-      ArrayList<SegToken> newlist = new ArrayList<SegToken>();
-      newlist.add(token);
-      tokenListTable.put(s, newlist);
-    } else {
-      List<SegToken> tokenList = tokenListTable.get(s);
-      tokenList.add(token);
-    }
-    if (s > maxStart)
-      maxStart = s;
+    table.add(token.startOffset + 1, token);
   }
 
-  /**
-   * Return a {@link List} of all tokens in the map, ordered by startOffset.
-   * 
-   * @return {@link List} of all tokens in the map.
-   */
-  public List<SegToken> toTokenList() {
-    List<SegToken> result = new ArrayList<SegToken>();
-    int s = -1, count = 0, size = tokenListTable.size();
-    List<SegToken> tokenList;
-
-    while (count < size) {
-      if (isStartExist(s)) {
-        tokenList = tokenListTable.get(s);
-        for (SegToken st : tokenList) {
-          result.add(st);
-        }
-        count++;
-      }
-      s++;
-    }
-    return result;
-  }
-
   public String toString() {
-    List<SegToken> tokenList = this.toTokenList();
+    List<SegToken> tokenList = makeIndex();
     StringBuilder sb = new StringBuilder();
     for (SegToken t : tokenList) {
       sb.append(t + "\n");
     }
     return sb.toString();
   }
+  
+  void reset(int size) {
+    table.reset(size + 2);
+  }
 }
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java	(revision 831928)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java	(working copy)
@@ -19,7 +19,7 @@
 
 import java.util.Arrays;
 
-import org.apache.lucene.analysis.cn.smart.WordType; // for javadocs
+import org.apache.lucene.analysis.cn.smart.hhmm.WordType; // for javadocs
 
 /**
  * SmartChineseAnalyzer internal token
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java	(revision 831928)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java	(working copy)
@@ -1,72 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.cn.smart.hhmm;
-
-import org.apache.lucene.analysis.cn.smart.Utility;
-import org.apache.lucene.analysis.cn.smart.WordType;
-
-/**
- * <p>
- * Filters a {@link SegToken} by converting full-width latin to half-width, then lowercasing latin.
- * Additionally, all punctuation is converted into {@link Utility#COMMON_DELIMITER}
- * </p>
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
- */
-public class SegTokenFilter {
-
-  /**
-   * Filter an input {@link SegToken}
-   * <p>
-   * Full-width latin will be converted to half-width, then all latin will be lowercased.
-   * All punctuation is converted into {@link Utility#COMMON_DELIMITER}
-   * </p>
-   * 
-   * @param token input {@link SegToken}
-   * @return normalized {@link SegToken}
-   */
-  public SegToken filter(SegToken token) {
-    switch (token.wordType) {
-      case WordType.FULLWIDTH_NUMBER:
-      case WordType.FULLWIDTH_STRING: /* first convert full-width -> half-width */
-        for (int i = 0; i < token.charArray.length; i++) {
-          if (token.charArray[i] >= 0xFF10)
-            token.charArray[i] -= 0xFEE0;
-
-          if (token.charArray[i] >= 0x0041 && token.charArray[i] <= 0x005A) /* lowercase latin */
-            token.charArray[i] += 0x0020;
-        }
-        break;
-      case WordType.STRING:
-        for (int i = 0; i < token.charArray.length; i++) {
-          if (token.charArray[i] >= 0x0041 && token.charArray[i] <= 0x005A) /* lowercase latin */
-            token.charArray[i] += 0x0020;
-        }
-        break;
-      case WordType.DELIMITER: /* convert all punctuation to Utility.COMMON_DELIMITER */
-        token.charArray = Utility.COMMON_DELIMITER;
-        break;
-      default:
-        break;
-    }
-    return token;
-  }
-}
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java	(revision 831928)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java	(working copy)
@@ -1,93 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.cn.smart.hhmm;
-
-import java.util.Arrays;
-
-/**
- * A pair of tokens in {@link SegGraph}
- * <p><font color="#FF0000">
- * WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental. 
- * The APIs and file formats introduced here might change in the future and will not be 
- * supported anymore in such a case.</font>
- * </p>
- */
-class SegTokenPair {
-
-  public char[] charArray;
-
-  /**
-   * index of the first token in {@link SegGraph}
-   */
-  public int from;
-
-  /**
-   * index of the second token in {@link SegGraph}
-   */
-  public int to;
-
-  public double weight;
-
-  public SegTokenPair(char[] idArray, int from, int to, double weight) {
-    this.charArray = idArray;
-    this.from = from;
-    this.to = to;
-    this.weight = weight;
-  }
-
-  /**
-   * @see java.lang.Object#hashCode()
-   */
-  public int hashCode() {
-    final int prime = 31;
-    int result = 1;
-    for(int i=0;i<charArray.length;i++) {
-      result = prime * result + charArray[i];
-    }
-    result = prime * result + from;
-    result = prime * result + to;
-    long temp;
-    temp = Double.doubleToLongBits(weight);
-    result = prime * result + (int) (temp ^ (temp >>> 32));
-    return result;
-  }
-
-  /**
-   * @see java.lang.Object#equals(java.lang.Object)
-   */
-  public boolean equals(Object obj) {
-    if (this == obj)
-      return true;
-    if (obj == null)
-      return false;
-    if (getClass() != obj.getClass())
-      return false;
-    SegTokenPair other = (SegTokenPair) obj;
-    if (!Arrays.equals(charArray, other.charArray))
-      return false;
-    if (from != other.from)
-      return false;
-    if (to != other.to)
-      return false;
-    if (Double.doubleToLongBits(weight) != Double
-        .doubleToLongBits(other.weight))
-      return false;
-    return true;
-  }
-
-}
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/TokenGraph.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/TokenGraph.java	(revision 0)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/TokenGraph.java	(revision 0)
@@ -0,0 +1,95 @@
+package org.apache.lucene.analysis.cn.smart.hhmm;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A generic graph of items indexed by position.
+ */
+final class TokenGraph<E> {
+  private final List<List<E>> table = new ArrayList<List<E>>();
+  private int size = 0;
+  private int max = 0;
+  
+  /**
+   * Create a new TokenGraph with an empty slot for each position below <code>size</code>.
+   */
+  TokenGraph(int size) {
+    reset(size);
+  }
+  
+  /**
+   * Add an <code>item</code> at the specified <code>position</code>; the first item at a position adds a mapping.
+   */
+  void add(int position, E item) {
+    List<E> slot = table.get(position);
+    if (slot.isEmpty())
+      size++;
+    slot.add(item);
+    max = Math.max(max, position);
+  }
+  
+  /**
+   * Get the list of items at the specified <code>position</code>
+   * @return {@link List} of items at this position (the internal list, not a copy). 
+   * The list will be empty if no mappings exist.
+   */
+  List<E> get(int position) {
+    return table.get(position);
+  }
+  
+  /**
+   * True if items exist at the specified <code>position</code>
+   */
+  boolean isExists(int position) {
+    return !table.get(position).isEmpty();
+  }
+  
+  /**
+   * Get the number of mappings (positions holding at least one item) in this graph.
+   */
+  int size() {
+    return size;
+  }
+  
+  /**
+   * Get the maximum position that has a mapping in this graph (0 if the graph is empty).
+   */
+  int getMax() {
+    return max;
+  }
+  
+  /**
+   * Clear the graph, resetting it to a new empty graph of <code>size</code>
+   */
+  void reset(int size) {
+    // Empty every slot, including those at or beyond the new size: the table never
+    // shrinks, so a slot left over from a larger graph would otherwise keep stale items.
+    for (List<E> slot : table) {
+      slot.clear();
+    }
+    // Grow the table until positions 0..size-1 are all addressable.
+    while (table.size() < size) {
+      table.add(new ArrayList<E>());
+    }
+    this.size = 0;
+    this.max = 0;
+  }
+}

Property changes on: contrib\analyzers\smartcn\src\java\org\apache\lucene\analysis\cn\smart\hhmm\TokenGraph.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/Utility.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/Utility.java	(revision 831928)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/Utility.java	(working copy)
@@ -15,10 +15,8 @@
  * limitations under the License.
  */
 
-package org.apache.lucene.analysis.cn.smart;
+package org.apache.lucene.analysis.cn.smart.hhmm;
 
-import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter; // for javadoc
-
 /**
  * SmartChineseAnalyzer utility constants and methods
  * <p><font color="#FF0000">
@@ -27,7 +25,7 @@
  * supported anymore in such a case.</font>
  * </p>
  */
-public class Utility {
+class Utility {
 
   public static final char[] STRING_CHAR_ARRAY = new String("未##串")
       .toCharArray();
@@ -41,16 +39,6 @@
   public static final char[] END_CHAR_ARRAY = new String("末##末").toCharArray();
 
   /**
-   * Delimiters will be filtered to this character by {@link SegTokenFilter}
-   */
-  public static final char[] COMMON_DELIMITER = new char[] { ',' };
-
-  /**
-   * Space-like characters that need to be skipped: such as space, tab, newline, carriage return.
-   */
-  public static final String SPACES = " 　\t\r\n";
-
-  /**
    * Maximum bigram frequency (used in the smoothing function). 
    */
   public static final int MAX_FREQUENCE = 2079997 + 80000;
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java	(revision 831928)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java	(working copy)
@@ -31,7 +31,6 @@
 import java.nio.ByteOrder;
 
 import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
-import org.apache.lucene.analysis.cn.smart.Utility;
 
 /**
  * SmartChineseAnalyzer Word Dictionary
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordType.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordType.java	(revision 831928)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordType.java	(working copy)
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.lucene.analysis.cn.smart;
+package org.apache.lucene.analysis.cn.smart.hhmm;
 
 /**
  * Internal SmartChineseAnalyzer token type constants
Index: contrib/analyzers/smartcn/src/resources/org/apache/lucene/analysis/cn/smart/stopwords.txt
===================================================================
--- contrib/analyzers/smartcn/src/resources/org/apache/lucene/analysis/cn/smart/stopwords.txt	(revision 831928)
+++ contrib/analyzers/smartcn/src/resources/org/apache/lucene/analysis/cn/smart/stopwords.txt	(working copy)
@@ -1,59 +0,0 @@
-////////// Punctuation tokens to remove ////////////////
-,
-.
-`
--
-_
-=
-?
-'
-|
-"
-(
-)
-{
-}
-[
-]
-<
->
-*
-#
-&
-^
-$
-@
-!
-~
-:
-;
-+
-/
-\
-《
-》
-—
-－
-，
-。
-、
-：
-；
-！
-·
-？
-“
-”
-）
-（
-【
-】
-［
-］
-●
-// the line below contains an IDEOGRAPHIC SPACE character (Used as a space in Chinese)
-　
-
-//////////////// English Stop Words ////////////////
-
-//////////////// Chinese Stop Words ////////////////
Index: contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
===================================================================
--- contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java	(revision 831928)
+++ contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java	(working copy)
@@ -17,27 +17,20 @@
 
 package org.apache.lucene.analysis.cn.smart;
 
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.Reader;
-import java.io.UnsupportedEncodingException;
-import java.util.Date;
+import java.util.Set;
 
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.util.Version;
 
 public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
   
-  public void testChineseStopWordsDefault() throws Exception {
-    Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT); /* will load stopwords */
+  public void testChineseDefault() throws Exception {
+    Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT);
     String sentence = "我购买了道具和服装。";
     String result[] = { "我", "购买", "了", "道具", "和", "服装" };
     assertAnalyzesTo(ca, sentence, result);
-    // set stop-words from the outer world - must yield same behavior
-    ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT, SmartChineseAnalyzer.getDefaultStopSet());
-    assertAnalyzesTo(ca, sentence, result);
   }
   
   /*
@@ -45,7 +38,7 @@
    * This tests to ensure the SentenceTokenizer->WordTokenFilter chain works correctly.
    */
   public void testChineseStopWordsDefaultTwoPhrases() throws Exception {
-    Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT); /* will load stopwords */
+    Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT);
     String sentence = "我购买了道具和服装。 我购买了道具和服装。";
     String result[] = { "我", "购买", "了", "道具", "和", "服装", "我", "购买", "了", "道具", "和", "服装" };
     assertAnalyzesTo(ca, sentence, result);
@@ -53,49 +46,38 @@
   
   /*
    * This test is the same as the above, except using an ideographic space as a separator.
-   * This tests to ensure the stopwords are working correctly.
    */
   public void testChineseStopWordsDefaultTwoPhrasesIdeoSpace() throws Exception {
-    Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT); /* will load stopwords */
+    Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT);
     String sentence = "我购买了道具和服装　我购买了道具和服装。";
     String result[] = { "我", "购买", "了", "道具", "和", "服装", "我", "购买", "了", "道具", "和", "服装" };
     assertAnalyzesTo(ca, sentence, result);
   }
   
   /*
-   * Punctuation is handled in a strange way if you disable stopwords
-   * In this example the IDEOGRAPHIC FULL STOP is converted into a comma.
-   * if you don't supply (true) to the constructor, or use a different stopwords list,
-   * then punctuation is indexed.
+   * Check that position increments after stopwords are correct,
+   * when stopfilter is configured with enablePositionIncrements
    */
-  public void testChineseStopWordsOff() throws Exception {
-    Analyzer[] analyzers = new Analyzer[] {
-      new SmartChineseAnalyzer(Version.LUCENE_CURRENT, false),/* doesn't load stopwords */
-      new SmartChineseAnalyzer(Version.LUCENE_CURRENT, null) /* sets stopwords to empty set */};
-    String sentence = "我购买了道具和服装。";
-    String result[] = { "我", "购买", "了", "道具", "和", "服装", "," };
-    for (Analyzer analyzer : analyzers) {
-      assertAnalyzesTo(analyzer, sentence, result);
-      assertAnalyzesToReuse(analyzer, sentence, result);
-    }
+  public void testChineseStopWords2() throws Exception {
+    Set<?> stopwords = StopFilter.makeStopSet("了");
+    Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT, stopwords);
+    String terms[] = { "我", "购买", "道具", "和", "服装" };
+    int startOffsets[] = { 0, 1, 4, 6, 7 };
+    int endOffsets[] = { 1, 3, 6, 7, 9 };
+    int posIncr[] = { 1, 1, 2, 1, 1 }; // increment of 2 where "了" was removed
+    assertAnalyzesTo(ca, "我购买了道具和服装", terms, startOffsets, endOffsets, posIncr);
   }
   
   /*
-   * Check that position increments after stopwords are correct,
-   * when stopfilter is configured with enablePositionIncrements
+   * Test analyzing an empty document
    */
-  public void testChineseStopWords2() throws Exception {
-    Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT); /* will load stopwords */
-    String sentence = "Title:San"; // : is a stopword
-    String result[] = { "titl", "san"};
-    int startOffsets[] = { 0, 6 };
-    int endOffsets[] = { 5, 9 };
-    int posIncr[] = { 1, 2 };
-    assertAnalyzesTo(ca, sentence, result, startOffsets, endOffsets, posIncr);
+  public void testEmpty() throws Exception {
+    // An empty document must produce no tokens.
+    assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT), "", new String[0]);
   }
   
   public void testChineseAnalyzer() throws Exception {
-    Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true);
+    Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT);
     String sentence = "我购买了道具和服装。";
     String[] result = { "我", "购买", "了", "道具", "和", "服装" };
     assertAnalyzesTo(ca, sentence, result);
@@ -105,7 +87,7 @@
    * English words are lowercased and porter-stemmed.
    */
   public void testMixedLatinChinese() throws Exception {
-    assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "我购买 Tests 了道具和服装", 
+    assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT), "我购买 Tests 了道具和服装", 
         new String[] { "我", "购买", "test", "了", "道具", "和", "服装"});
   }
   
@@ -113,7 +95,7 @@
    * Numerics are parsed as their own tokens
    */
   public void testNumerics() throws Exception {
-    assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "我购买 Tests 了道具和服装1234",
+    assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT), "我购买 Tests 了道具和服装1234",
       new String[] { "我", "购买", "test", "了", "道具", "和", "服装", "1234"});
   }
   
@@ -121,7 +103,7 @@
    * Full width alphas and numerics are folded to half-width
    */
   public void testFullWidth() throws Exception {
-    assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "我购买 Ｔｅｓｔｓ 了道具和服装１２３４",
+    assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT), "我购买 Ｔｅｓｔｓ 了道具和服装１２３４",
         new String[] { "我", "购买", "test", "了", "道具", "和", "服装", "1234"});
   }
   
@@ -129,7 +111,7 @@
    * Presentation form delimiters are removed
    */
   public void testDelimiters() throws Exception {
-    assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "我购买︱ Tests 了道具和服装", 
+    assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT), "我购买︱ Tests 了道具和服装", 
         new String[] { "我", "购买", "test", "了", "道具", "和", "服装"});
   }
   
@@ -138,7 +120,7 @@
    * (regardless of Unicode category)
    */
   public void testNonChinese() throws Exception {
-    assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "我购买 روبرتTests 了道具和服装", 
+    assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT), "我购买 روبرتTests 了道具和服装", 
         new String[] { "我", "购买", "ر", "و", "ب", "ر", "ت", "test", "了", "道具", "和", "服装"});
   }
   
@@ -148,15 +130,15 @@
    * Currently it is being analyzed into single characters...
    */
   public void testOOV() throws Exception {
-    assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "优素福·拉扎·吉拉尼",
+    assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT), "优素福·拉扎·吉拉尼",
       new String[] { "优", "素", "福", "拉", "扎", "吉", "拉", "尼" });
     
-    assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "优素福拉扎吉拉尼",
+    assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT), "优素福拉扎吉拉尼",
       new String[] { "优", "素", "福", "拉", "扎", "吉", "拉", "尼" });
   }
   
   public void testOffsets() throws Exception {
-    assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "我购买了道具和服装",
+    assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT), "我购买了道具和服装",
         new String[] { "我", "购买", "了", "道具", "和", "服装" },
         new int[] { 0, 1, 3, 4, 6, 7 },
         new int[] { 1, 3, 4, 6, 7, 9 });
