Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
===================================================================
--- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java	(revision 1241079)
+++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java	(working copy)
@@ -31,6 +31,8 @@
 
 public class UserDictionaryTest extends LuceneTestCase {
 
+  // nocommit do we test passing userDict to analyzer...?
+
   private UserDictionary readDict() throws IOException {
     InputStream is = SegmenterTest.class.getResourceAsStream("userdict.txt");
     if (is == null)
Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java
===================================================================
--- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java	(revision 1241079)
+++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java	(working copy)
@@ -18,8 +18,12 @@
  */
 
 import java.io.IOException;
+import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase {
   /** This test fails with NPE when the 
@@ -41,15 +45,78 @@
         new int[] { 1, 2, 2,  2 }
       );
   }
-  
+
+  /*
+  public void testDirect() throws IOException {
+    Analyzer a = new KuromojiAnalyzer(TEST_VERSION_CURRENT);
+    final String s0 = "本来は、貧困層の女性や子供に医療保護を提供するために創設された制度である、" +
+      "アメリカ低所得者医療援助制度が、今日では、その予算の約３分の１を老人に費やしている。";
+    //final String s = s0 + s0 + s0 + s0 + s0 + s0;
+    final String s = s0;
+
+    TokenStream ts = a.tokenStream("dummy", new StringReader(s));
+    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);    
+    while(ts.incrementToken()) {
+      System.out.println("  tok=" + termAtt);
+    }
+
+    System.out.println("TEST: again");
+    // again
+    ts = a.tokenStream("dummy", new StringReader(s));
+    termAtt = ts.getAttribute(CharTermAttribute.class);    
+    while(ts.incrementToken()) {
+      System.out.println("  tok=" + termAtt);
+    }
+  }
+  */
+
+  // Kansai International Airport (compound token):
+  // 関西国際空港 
+
+  // nocommit
+  /*
+  public void testBasics2() throws IOException {
+    //final String s = "本来は、貧困層の女性や子供に医療保護を提供するために創設された制度である、" +
+    //"アメリカ低所得者医療援助制度が、今日では、その予算の約３分の１を老人に費やしている。";
+    final String s = "関西国際空港。関西国際空港";
+    assertAnalyzesTo(new KuromojiAnalyzer(TEST_VERSION_CURRENT), s,
+                     new String[0],
+                     new int[0],
+                     new int[0],
+                     new int[0]);
+  }
+  */
+
   /**
    * Test that search mode is enabled and working by default
    */
   public void testDecomposition() throws IOException {
+    // Senior software engineer:
     assertAnalyzesTo(new KuromojiAnalyzer(TEST_VERSION_CURRENT), "シニアソフトウェアエンジニア",
-        new String[] { "シニア", "ソフトウェア", "エンジニア" }
+                     new String[] { "シニア",
+                                    "シニアソフトウェアエンジニア",
+                                    "ソフトウェア",
+                                    "エンジニア" }
     );
+
+    // Kansai International Airport:
+    assertAnalyzesTo(new KuromojiAnalyzer(TEST_VERSION_CURRENT), "関西国際空港",
+                     new String[] { "関西",
+                                    "関西国際空港", // zero pos inc
+                                    "国際",
+                                    "空港" }
+                     );
+
+    // Konika Minolta Holdings; not quite the right
+    // segmentation (see LUCENE-3726):
+    assertAnalyzesTo(new KuromojiAnalyzer(TEST_VERSION_CURRENT), "コニカミノルタホールディングス",
+                     new String[] { "コニカ",
+                                    "コニカミノルタホールディングス", // zero pos inc
+                                    "ミノルタ", 
+                                    "ホールディングス"}
+                     );
   }
+
   
   /**
    * blast random strings against the analyzer
Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java
===================================================================
--- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java	(revision 1241079)
+++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java	(working copy)
@@ -36,7 +36,8 @@
   private final Analyzer analyzer = new Analyzer() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new KuromojiTokenizer(segmenter, reader);
+      //Tokenizer tokenizer = new KuromojiTokenizer(segmenter, reader);
+      Tokenizer tokenizer = new KuromojiTokenizer2(reader, null, true, true);
       return new TokenStreamComponents(tokenizer, tokenizer);
     }
   };
Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java
===================================================================
--- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java	(revision 1241079)
+++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java	(working copy)
@@ -28,7 +28,8 @@
   private Analyzer analyzer = new Analyzer() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new KuromojiTokenizer(reader);
+      //Tokenizer tokenizer = new KuromojiTokenizer(reader);
+      Tokenizer tokenizer = new KuromojiTokenizer2(reader, null, true, true);
       return new TokenStreamComponents(tokenizer, new KuromojiBaseFormFilter(tokenizer));
     }
   };
Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java
===================================================================
--- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java	(revision 1241079)
+++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java	(working copy)
@@ -26,6 +26,8 @@
 import org.junit.BeforeClass;
 import org.junit.Test;
 
+// nocommit cut these over to KT2
+
 public class SegmenterTest extends LuceneTestCase {
   
   private static Segmenter segmenter;
Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java
===================================================================
--- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java	(revision 1241079)
+++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java	(working copy)
@@ -33,7 +33,8 @@
   private Analyzer analyzer = new Analyzer() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new KuromojiTokenizer(reader);
+      //Tokenizer tokenizer = new KuromojiTokenizer(reader);
+      Tokenizer tokenizer = new KuromojiTokenizer2(reader, null, true, true);
       return new TokenStreamComponents(tokenizer, tokenizer);
     }
   };
@@ -125,6 +126,9 @@
   public void testSurrogates2() throws IOException {
     int numIterations = atLeast(10000);
     for (int i = 0; i < numIterations; i++) {
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter=" + i);
+      }
       String s = _TestUtil.randomUnicodeString(random, 100);
       TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
       CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java	(revision 1241079)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java	(working copy)
@@ -27,6 +27,7 @@
 import java.util.Map;
 import java.util.TreeMap;
 
+import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
 import org.apache.lucene.analysis.kuromoji.util.CSVUtil;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.fst.Builder;
@@ -159,6 +160,10 @@
     return found ? toIndexArray(result) : EMPTY_RESULT;
   }
   
+  public TokenInfoFST getFST() {
+    return fst;
+  }
+
   private static final int[][] EMPTY_RESULT = new int[0][];
   
   /**
Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java	(revision 1241079)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java	(working copy)
@@ -27,6 +27,7 @@
 import org.apache.lucene.analysis.cjk.CJKWidthFilter;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.util.Version;
@@ -34,6 +35,7 @@
 public class KuromojiAnalyzer extends StopwordAnalyzerBase {
   private final Segmenter segmenter;
   private final Set<String> stoptags;
+  private final UserDictionary userDict;
   
   public KuromojiAnalyzer(Version matchVersion) {
     this(matchVersion, new Segmenter(), DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS);
@@ -43,7 +45,15 @@
     super(matchVersion, stopwords);
     this.segmenter = segmenter;
     this.stoptags = stoptags;
+    userDict = null;
   }
+
+  public KuromojiAnalyzer(Version matchVersion, UserDictionary userDict, Set<?> stopwords, Set<String> stoptags) {
+    super(matchVersion, stopwords);
+    this.userDict = userDict;
+    this.stoptags = stoptags;
+    this.segmenter = null;
+  }
   
   public static Set<?> getDefaultStopSet(){
     return DefaultSetHolder.DEFAULT_STOP_SET;
@@ -80,7 +90,9 @@
   
   @Override
   protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-    Tokenizer tokenizer = new KuromojiTokenizer(this.segmenter, reader);
+    //Tokenizer tokenizer = new KuromojiTokenizer(this.segmenter, reader);
+    // nocommit pass userDict
+    Tokenizer tokenizer = new KuromojiTokenizer2(reader, userDict, true, true);
     TokenStream stream = new LowerCaseFilter(matchVersion, tokenizer);
     stream = new CJKWidthFilter(stream);
     stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags);
Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java	(revision 0)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java	(working copy)
@@ -0,0 +1,984 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition;
+import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
+import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
+import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
+import org.apache.lucene.analysis.kuromoji.dict.TokenInfoFST;
+import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
+import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
+import org.apache.lucene.analysis.kuromoji.tokenattributes.*;
+import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.fst.FST;
+
+// TODO
+//   - queue could also hold partial FST matches, ie, one
+//     char at a time, but this isn't helpful unless the FST
+//     has weights (vs opaque outputs).  EG it would be
+//     useful (necessary!) for suggester?
+
+// nocommit add toDot and look at 1st pass intersection
+// nocommit tie breaks....
+// nocommit does the bigram cost span start/end...?
+// nocommit break out "reusable" viterbi state
+// nocommit get a toDot PrintStream output working...
+
+// nocommit -- need a test that doesn't pre-split by
+// sentence... ie, we don't BOS/EOS on each sentence
+// break any more... so this can change the results
+// depending on whether ipadic was "trained" with sentence
+// breaks?
+
+// nocommit -- the negative score assigned to user dict
+// matches violates assumption of djikstra's...
+
+/* Uses a rolling Viterbi search to find the least cost
+ * segmentation (path) of the incoming characters.
+ *
+ * @lucene.experimental */
+public final class KuromojiTokenizer2 extends Tokenizer {
+
+  private static final boolean VERBOSE = false;
+
+  private static final int SEARCH_MODE_KANJI_LENGTH = 2;
+
+  private static final int SEARCH_MODE_OTHER_LENGTH = 7; // Must be >= SEARCH_MODE_KANJI_LENGTH
+
+  private static final int SEARCH_MODE_KANJI_PENALTY = 3000;
+
+  private static final int SEARCH_MODE_OTHER_PENALTY = 1700;
+
+  private static boolean DO_OUTPUT_COMPOUND = true;
+
+  // nocommit: Mode, searchMode, extendedMode
+
+  private final TokenInfoFST fst;
+  private final TokenInfoDictionary dictionary;
+  private final UnknownDictionary unkDictionary;
+  private final ConnectionCosts costs;
+  private final UserDictionary userDictionary;
+  private final CharacterDefinition characterDefinition;
+
+  private final FST.Arc<Long> arc = new FST.Arc<Long>();
+  private final FST.BytesReader fstReader;
+  private final IntsRef wordIdRef = new IntsRef();
+
+  private final FST.BytesReader userFSTReader;
+  private final TokenInfoFST userFST;
+
+  private Reader reader;
+
+  // Next absolute position to process:
+  private int pos;
+
+  private WrappedCharArray buffer = new WrappedCharArray();
+
+  //private Position lastPosData;
+
+  // index of the last character of unknown word:
+  // nocommit put back:
+  // int unknownWordEndIndex = -1;
+
+  private WrappedPositionArray positions = new WrappedPositionArray();
+
+  private boolean end;
+  private final boolean discardPunctuation;
+  private final boolean searchMode;
+
+  private int lastBackTracePos;
+  private int lastTokenPos;
+
+  // Already parsed but not yet passed to caller:
+  private final List<Token> pending = new ArrayList<Token>();
+
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+  private final BaseFormAttribute basicFormAtt = addAttribute(BaseFormAttribute.class);
+  private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
+  private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
+  private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class);
+
+  public KuromojiTokenizer2(Reader input, UserDictionary userDictionary, boolean discardPunctuation, boolean searchMode) {
+    super(input);
+    dictionary = TokenInfoDictionary.getInstance();
+    fst = dictionary.getFST();
+    unkDictionary = UnknownDictionary.getInstance();
+    characterDefinition = unkDictionary.getCharacterDefinition();
+    this.userDictionary = userDictionary;
+    costs = ConnectionCosts.getInstance();
+    fstReader = fst.getBytesReader(0);
+    if (userDictionary != null) {
+      userFST = userDictionary.getFST();
+      userFSTReader = userFST.getBytesReader(0);
+    } else {
+      userFST = null;
+      userFSTReader = null;
+    }
+    this.discardPunctuation = discardPunctuation;
+    this.searchMode = searchMode;
+    buffer.reset(input);
+    resetState();
+  }
+
+  @Override
+  public void reset(Reader input) throws IOException {
+    super.reset(input);
+    buffer.reset(input);
+    resetState();
+  }
+
+  private void resetState() {
+    positions.reset();
+    // nocommit put back
+    //unknownWordEndIndex = -1;
+    pos = 0;
+    end = false;
+    lastBackTracePos = 0;
+    lastTokenPos = -1;
+
+    // Add BOS:
+    positions.get(0).add(0, 0, -1, -1, -1, Type.KNOWN);
+
+    //lastPosData = null;
+  }
+
+  @Override
+  public final void end() {
+    // set final offset
+    offsetAtt.setOffset(correctOffset(pos), correctOffset(pos));
+  }
+
+  // Holds all back pointers arriving to this position:
+  private final static class Position {
+
+    int pos;
+
+    int count;
+
+    // maybe single int array * 5?
+    int[] costs = new int[4];
+    // nocommit rename to lastRightID or pathRightID or something:
+    int[] nodeID = new int[4];
+    int[] backPos = new int[4];
+    int[] backIndex = new int[4];
+    int[] backID = new int[4];
+    Type[] backType = new Type[4];
+
+    public void grow() {
+      costs = ArrayUtil.grow(costs, 1+count);
+      nodeID = ArrayUtil.grow(nodeID, 1+count);
+      backPos = ArrayUtil.grow(backPos, 1+count);
+      backIndex = ArrayUtil.grow(backIndex, 1+count);
+      backID = ArrayUtil.grow(backID, 1+count);
+      final Type[] newBackType = new Type[backID.length];
+      System.arraycopy(backType, 0, newBackType, 0, backType.length);
+      backType = newBackType;
+    }
+
+    public void add(int cost, int nodeID, int backPos, int backIndex, int backID, Type backType) {
+      // nocommit in theory, we should check if nodeID is
+      // already present here, and update it if
+      // so... instead of just always adding:
+      if (count == costs.length) {
+        grow();
+      }
+      this.costs[count] = cost;
+      this.nodeID[count] = nodeID;
+      this.backPos[count] = backPos;
+      this.backIndex[count] = backIndex;
+      this.backID[count] = backID;
+      this.backType[count] = backType;
+      count++;
+    }
+
+    // nocommit maybe?
+    // public void update(int cost, int nodeID, int backPos, int backIndex, int backID)
+  }
+
+  // nocommit absorb into add...?
+  private void add(Dictionary dict, Position posData, int endPos, int wordID, Type type) throws IOException {
+    final int wordCost = dict.getWordCost(wordID);
+    final int leftID = dict.getLeftId(wordID);
+    int leastCost = Integer.MAX_VALUE;
+    int leastIDX = 0;
+    for(int idx=0;idx<posData.count;idx++) {
+      // Cost is path cost so far, plus word cost, plus
+      // bigram cost:
+      final int cost = posData.costs[idx] + wordCost + costs.get(posData.nodeID[idx], leftID);
+      if (cost < leastCost) {
+        leastCost = cost;
+        leastIDX = idx;
+      }
+    }
+
+    if (VERBOSE) {
+      System.out.println("      + cost=" + leastCost + " wordID=" + wordID + " wordCat=" + leftID + " tok=" + new String(buffer.get(posData.pos, endPos-posData.pos)));
+    }
+
+    // nocommit ideally we don't have to do this ... just
+    // putting it here to confirm same results as current
+    // segmenter:
+    if (!DO_OUTPUT_COMPOUND && searchMode) {
+      int length = endPos - posData.pos;
+      if (length > SEARCH_MODE_KANJI_LENGTH) {
+        boolean allKanji = true;
+        // check if node consists of only kanji
+        for (int pos = 0; pos < length; pos++) {
+          if (!characterDefinition.isKanji((char) buffer.get(posData.pos + pos))) {
+            allKanji = false;
+            break;
+          }				
+        }
+          
+        final int penalty;
+        if (allKanji) {	// Process only Kanji keywords
+          penalty = (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY;
+        } else if (length > SEARCH_MODE_OTHER_LENGTH) {
+          penalty = (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY;								
+        } else {
+          penalty = 0;
+        }
+        if (VERBOSE) {
+          if (penalty != 0) {
+            System.out.println("        + penalty=" + penalty + " cost=" + (leastCost + penalty));
+          }
+        }
+        leastCost += penalty;
+      }
+    }
+
+    positions.get(endPos).add(leastCost, dict.getRightId(wordID), posData.pos, leastIDX, wordID, type);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+
+    // parse() is able to return w/o producing any new
+    // tokens, when the tokens it had produced were entirely
+    // punctuation.  So we loop here until we get a real
+    // token or we end:
+    while (pending.size() == 0) {
+      if (end) {
+        return false;
+      }
+
+      // Push Viterbi forward some more:
+      parse();
+    }
+
+    final Token token = pending.remove(pending.size()-1);
+
+    int position = token.getPosition();
+    int length = token.getLength();
+    clearAttributes();
+    termAtt.copyBuffer(token.getSurfaceForm(), token.getOffset(), length);
+    offsetAtt.setOffset(correctOffset(position), correctOffset(position+length));
+    basicFormAtt.setToken(token);
+    posAtt.setToken(token);
+    readingAtt.setToken(token);
+    inflectionAtt.setToken(token);
+    if (token.getPosition() == lastTokenPos) {
+      posIncAtt.setPositionIncrement(0);
+    } else {
+      assert token.getPosition() > lastTokenPos;
+      posIncAtt.setPositionIncrement(1);
+    }
+    if (VERBOSE) {
+      System.out.println("    incToken: return token=" + token);
+    }
+    lastTokenPos = token.getPosition();
+    return true;
+  }
+
+  // Acts like a forever growing char[] as you read
+  // characters into it from the provided reader, but
+  // internally it uses a circular buffer to only hold the
+  // characters that haven't been freed yet:
+  private static final class WrappedCharArray {
+
+    // TODO: pull out as standalone oal.util class?
+
+    private Reader reader;
+
+    private char[] buffer = new char[32];
+
+    // Next array index to write to in buffer:
+    private int nextWrite;
+
+    // Next absolute position to read from reader:
+    private int nextPos;
+
+    // How many valid chars (wrapped) are in the buffer:
+    private int count;
+
+    // True if we hit EOF
+    private boolean end;
+    
+    /** Clear array and switch to new reader. */
+    public void reset(Reader reader) {
+      this.reader = reader;
+      nextPos = 0;
+      nextWrite = 0;
+      count = 0;
+      end = false;
+    }
+
+    /* Absolute position read.  NOTE: pos must not jump
+     * ahead by more than 1!  Ie, it's OK to read arbitrarily
+     * far back (just not prior to the last {@link
+     * #freeBefore}), but NOT ok to read arbitrarily far
+     * ahead.  Returns -1 if you hit EOF. */
+    public int get(int pos) throws IOException {
+      //System.out.println("    get pos=" + pos + " nextPos=" + nextPos + " count=" + count);
+      if (pos == nextPos) {
+        if (end) {
+          return -1;
+        }
+        final int ch = reader.read();
+        if (ch == -1) {
+          end = true;
+          return -1;
+        }
+        if (count == buffer.length) {
+          // Grow
+          final char[] newBuffer = new char[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_CHAR)];
+          System.arraycopy(buffer, nextWrite, newBuffer, 0, buffer.length - nextWrite);
+          System.arraycopy(buffer, 0, newBuffer, buffer.length - nextWrite, nextWrite);
+          nextWrite = buffer.length;
+          //System.out.println("buffer: grow from " + buffer.length + " to " + newBuffer.length);
+          buffer = newBuffer;
+        }
+        if (nextWrite == buffer.length) {
+          nextWrite = 0;
+        }
+        buffer[nextWrite++] = (char) ch;
+        count++;
+        nextPos++;
+        return ch;
+      } else {
+        // Cannot read from future (except by 1):
+        assert pos < nextPos;
+
+        // Cannot read from already freed past:
+        assert nextPos - pos <= count;
+
+        final int index = getIndex(pos);
+        return buffer[index];
+      }
+    }
+
+    // For assert:
+    private boolean inBounds(int pos) {
+      return pos < nextPos && pos >= nextPos - count;
+    }
+
+    private int getIndex(int pos) {
+      int index = nextWrite - (nextPos - pos);
+      if (index < 0) {
+        // Wrap:
+        index += buffer.length;
+        assert index >= 0;
+      }
+      return index;
+    }
+
+    public char[] get(int posStart, int length) {
+      assert length > 0;
+      assert inBounds(posStart): "posStart=" + posStart + " length=" + length;
+      //System.out.println("    buffer.get posStart=" + posStart + " len=" + length);
+      
+      final int startIndex = getIndex(posStart);
+      final int endIndex = getIndex(posStart + length);
+      //System.out.println("      startIndex=" + startIndex + " endIndex=" + endIndex);
+
+      final char[] result = new char[length];
+      // nocommit what if entire buffer is requested...?
+      if (endIndex >= startIndex) {
+        System.arraycopy(buffer, startIndex, result, 0, endIndex-startIndex);
+      } else {
+        // Wrapped:
+        final int part1 = buffer.length-startIndex;
+        System.arraycopy(buffer, startIndex, result, 0, part1);
+        System.arraycopy(buffer, 0, result, buffer.length-startIndex, length-part1);
+      }
+      return result;
+    }
+
+    /** Call this to notify us that no chars before this
+     *  absolute position are needed anymore. */
+    public void freeBefore(int pos) {
+      assert pos <= nextPos;
+      count = nextPos - pos;
+      assert count < buffer.length;
+    }
+  }
+
+  // TODO: make generic'd version of this "circular array"?
+  private static final class WrappedPositionArray {
+    private Position[] positions = new Position[8];
+
+    public WrappedPositionArray() {
+      for(int i=0;i<positions.length;i++) {
+        positions[i] = new Position();
+      }
+    }
+
+    // Next array index to write to in positions:
+    private int nextWrite;
+
+    // Next position to write:
+    private int nextPos;
+    
+    // How many valid Position instances are held in the
+    // positions array:
+    private int count;
+
+    public void reset() {
+      nextWrite--;
+      while(count > 0) {
+        if (nextWrite == -1) {
+          nextWrite = positions.length - 1;
+        }
+        positions[nextWrite--].count = 0;
+        count--;
+      }
+      nextWrite = 0;
+      nextPos = 0;
+      count = 0;
+    }
+
+    /** Get Position instance for this absolute position;
+     *  this is allowed to be arbitrarily far "in the
+     *  future" but cannot be before the last freeBefore. */
+    public Position get(int pos) {
+      while(pos >= nextPos) {
+        //System.out.println("count=" + count + " vs len=" + positions.length);
+        if (count == positions.length) {
+          Position[] newPositions = new Position[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+          System.arraycopy(positions, nextWrite, newPositions, 0, positions.length-nextWrite);
+          System.arraycopy(positions, 0, newPositions, positions.length-nextWrite, nextWrite);
+          for(int i=positions.length;i<newPositions.length;i++) {
+            newPositions[i] = new Position();
+          }
+          nextWrite = positions.length;
+          positions = newPositions;
+        }
+        if (nextWrite == positions.length) {
+          nextWrite = 0;
+        }
+        // Should have already been reset:
+        assert positions[nextWrite].count == 0;
+        positions[nextWrite++].pos = nextPos++;
+        count++;
+      }
+      assert inBounds(pos);
+      final int index = getIndex(pos);
+      assert positions[index].pos == pos;
+      return positions[index];
+    }
+
+    public int getNextPos() {
+      return nextPos;
+    }
+
+    // For assert:
+    private boolean inBounds(int pos) {
+      return pos < nextPos && pos >= nextPos - count;
+    }
+
+    private int getIndex(int pos) {
+      int index = nextWrite - (nextPos - pos);
+      if (index < 0) {
+        index += positions.length;
+      }
+      return index;
+    }
+
+    public void freeBefore(int pos) {
+      final int toFree = count - (nextPos - pos);
+      assert toFree >= 0;
+      assert toFree <= count;
+      int index = nextWrite - count;
+      if (index < 0) {
+        index += positions.length;
+      }
+      for(int i=0;i<toFree;i++) {
+        if (index == positions.length) {
+          index = 0;
+        }
+        //System.out.println("  fb idx=" + index);
+        positions[index].count = 0;
+        index++;
+      }
+      count -= toFree;
+    }
+  }
+
+  /* Incrementally parse some more characters.  This runs
+   * the viterbi search forwards "enough" so that we
+   * generate some more tokens.  How far forward we must go
+   * depends on the incoming chars, since some chars can
+   * keep the parse ambiguous for longer.  Once the
+   * ambiguity is resolved we back trace, append the
+   * pending tokens, and return. */
+  private void parse() throws IOException {
+    if (VERBOSE) {
+      System.out.println("\nPARSE");
+    }
+
+    while (true) {
+
+      if (buffer.get(pos) == -1) {
+        // End of the input reader
+        break;
+      }
+
+      final Position posData = positions.get(pos);
+
+      if (pos != lastBackTracePos && posData.count == 1 && positions.getNextPos() == pos+1) {
+        // We are at a "frontier", and only one node is
+        // alive, so whatever the eventual best path is must
+        // come through this node.  So we can safely commit
+        // to the prefix of the best path at this point:
+        backtrace(posData, 0);
+
+        // Re-base cost so we don't risk int overflow:
+        posData.costs[0] = 0;
+
+        if (pending.size() != 0) {
+          return;
+        } else {
+          // nocommit: make punctuation-only testcase
+          // This means the backtrace only produced
+          // punctuation tokens, so we must keep parsing.
+        }
+      }
+
+      if (VERBOSE) {
+        System.out.println("\n  advance @ pos=" + pos);
+      }
+
+      if (posData.count == 0) {
+        // No arcs arrive here; move to next position:
+        pos++;
+        if (VERBOSE) {
+          System.out.println("    no arcs in; skip");
+        }
+        continue;
+      }
+
+      if (VERBOSE) {
+        System.out.println("    " + posData.count + " arcs in");
+      }
+
+      // nocommit must also 1) detect a
+      // less-obvious-yet-still-committable backtrace op,
+      // when, even though N > 1 states are/were alive, they
+      // all back through a single state, and also 2) maybe
+      // need to "force" a "when all else fails" backtrace
+
+      //lastPosData = posData;
+
+      boolean anyMatches = false;
+
+      // First try user dict:
+      if (userFST != null) {
+        userFST.getFirstArc(arc);
+        int output = 0;
+        for(int posAhead=posData.pos;;posAhead++) {
+          final int ch = buffer.get(posAhead);
+          if (ch == -1) {
+            break;
+          }
+          // Fixed: traverse userFST here, not the main
+          // dictionary fst -- userFSTReader reads userFST's
+          // bytes, and mixing it with fst trips the
+          // in.bytes == bytes assertion in FST:
+          if (userFST.findTargetArc(ch, arc, arc, posAhead == posData.pos, userFSTReader) == null) {
+            break;
+          }
+          output += arc.output.intValue();
+          if (arc.isFinal()) {
+            add(userDictionary, posData, posAhead+1, output + arc.nextFinalOutput.intValue(), Type.USER);
+            anyMatches = true;
+          }
+        }
+      }
+
+      // Then try the known (system) dictionary, but only if
+      // the user dict had no match starting here:
+      if (!anyMatches) {
+        fst.getFirstArc(arc);
+        int output = 0;
+
+        for(int posAhead=posData.pos;;posAhead++) {
+          final int ch = buffer.get(posAhead);
+          if (ch == -1) {
+            break;
+          }
+          //System.out.println("    match " + (char) ch + " posAhead=" + posAhead);
+
+          if (fst.findTargetArc(ch, arc, arc, posAhead == posData.pos, fstReader) == null) {
+            break;
+          }
+
+          output += arc.output.intValue();
+
+          if (arc.isFinal()) {
+            dictionary.lookupWordIds(output + arc.nextFinalOutput.intValue(), wordIdRef);
+            if (VERBOSE) {
+              System.out.println("    KNOWN word " + new String(buffer.get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1) + " " + wordIdRef.length + " wordIDs");
+            }
+            for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
+              add(dictionary, posData, posAhead+1, wordIdRef.ints[wordIdRef.offset + ofs], Type.KNOWN);
+              anyMatches = true;
+            }
+          }
+        }
+      }
+
+      // In the case of normal mode, it doesn't process unknown word greedily.
+
+      // nocommit: fix
+      /*
+      if (!searchMode && unknownWordEndIndex > posData.pos) {
+        continue;
+      }
+      */
+      final char firstCharacter = (char) buffer.get(pos);
+      // nocommit -- can't we NOT pursue unk if a known
+      // token "covers" us...?
+      if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) {
+
+        // Find unknown match:
+        final int characterId = characterDefinition.getCharacterClass(firstCharacter);
+
+        // NOTE: copied from UnknownDictionary.lookup:
+        int unknownWordLength;
+        if (!characterDefinition.isGroup(firstCharacter)) {
+          unknownWordLength = 1;
+        } else {
+          // Extract unknown word. Characters with the same character class are considered to be part of unknown word
+          unknownWordLength = 1;
+          for (int posAhead=pos+1;;posAhead++) {
+            final int ch = buffer.get(posAhead);
+            if (ch == -1) {
+              break;
+            }
+            if (characterId == characterDefinition.getCharacterClass((char) ch)) {
+              unknownWordLength++;
+            } else {
+              break;
+            }
+          }
+        }
+
+        unkDictionary.lookupWordIds(characterId, wordIdRef); // characters in input text are supposed to be the same
+        if (VERBOSE) {
+          System.out.println("    UNKNOWN word len=" + unknownWordLength + " " + wordIdRef.length + " wordIDs");
+        }
+        for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
+          add(unkDictionary, posData, posData.pos + unknownWordLength, wordIdRef.ints[wordIdRef.offset + ofs], Type.UNKNOWN);
+        }
+
+        // nocommit fixme
+        //unknownWordEndIndex = posData.pos + unknownWordLength;
+      }
+
+      pos++;
+    }
+
+    end = true;
+
+    if (pos > 0) {
+
+      /*
+        if (VERBOSE) {
+        System.out.println("\n  end @ pos=" + pos + " lastPosData=" + lastPosData.pos);
+        }
+      */
+
+      // Input is exhausted: pick the cheapest surviving path
+      // (including the EOS transition cost) and back-trace it:
+      final Position endPosData = positions.get(pos);
+      int leastCost = Integer.MAX_VALUE;
+      int leastIDX = 0;
+      for(int idx=0;idx<endPosData.count;idx++) {
+        // Add EOS cost:
+        final int cost = endPosData.costs[idx] + costs.get(endPosData.nodeID[idx], 0);
+        if (cost < leastCost) {
+          leastCost = cost;
+          leastIDX = idx;
+        }
+      }
+
+      backtrace(endPosData, leastIDX);
+    } else {
+      // No characters in the input string; return no tokens!
+    }
+  }
+
+  // Backtrace from the provided position, back to the last
+  // time we back-traced, accumulating the resulting tokens to
+  // the pending list.  The pending list is then in-reverse
+  // (last token should be returned first).
+  private void backtrace(final Position endPosData, final int fromIDX) throws IOException {
+    if (VERBOSE) {
+      System.out.println("\n  backtrace: " + (pos - lastBackTracePos) + " characters; last=" + lastBackTracePos + " cost=" + endPosData.costs[fromIDX]);
+    }
+    final int endPos = endPosData.pos;
+
+    // All chars in [lastBackTracePos, endPos): the surface
+    // forms of the tokens we emit slice into this array.
+    final char[] fragment = buffer.get(lastBackTracePos, endPos-lastBackTracePos);
+
+    // Walk backwards from endPos; bestIDX tracks which arc
+    // at each position the best path came through.
+    int pos = endPos;
+    int bestIDX = fromIDX;
+    Token altToken = null;
+
+    // We trace backwards, so this will be the leftWordID of
+    // the token after the one we are now on:
+    int lastLeftWordID = -1;
+    
+    // nocommit: don't use intermediate Token instance
+    // here... change this to just hold raw back trace info,
+    // then in incrementToken we pull the necessary char[],
+    // and only call freeBefore once we're done iterating
+    // these tokens:
+    while (pos > lastBackTracePos) {
+      //System.out.println("back pos=" + pos);
+      final Position posData = positions.get(pos);
+
+      // Arc we are about to emit as a token: from backPos to pos.
+      int backPos = posData.backPos[bestIDX];
+      int length = pos - backPos;
+      Type backType = posData.backType[bestIDX];
+
+      // nocommit turn back on
+      if (DO_OUTPUT_COMPOUND && searchMode) {
+        
+        // In searchMode, if best path had picked a too-long
+        // token, we use the penalty to compute the allowed
+        // max cost of the alternate back-trace.  If we find an
+        // alternate back trace with cost below that
+        // threshold, we pursue it instead (but also output
+        // the long token).
+
+        while (true) {
+
+          if (length > SEARCH_MODE_KANJI_LENGTH) {
+            //System.out.println("  test for alt token length=" + length + " pos=" + pos);
+            boolean allKanji = true;
+            // check if node consists of only kanji
+            for (int pos2 = backPos; pos2 < pos; pos2++) {
+              if (!characterDefinition.isKanji((char) buffer.get(pos2))) {
+                allKanji = false;
+                break;
+              }				
+            }
+            // Penalty grows with how far the token exceeds
+            // the search-mode length threshold:
+            final int penalty;
+            if (allKanji) {	// Process only Kanji keywords
+              penalty = (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY;
+            } else if (length > SEARCH_MODE_OTHER_LENGTH) {
+              penalty = (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY;								
+            } else {
+              penalty = 0;
+            }
+            //System.out.println("  cycle: penalty=" + penalty);
+
+            if (penalty != 0) {
+
+              // Use penalty to set threshold on 2nd best path:
+              int maxCost = posData.costs[bestIDX] + penalty;
+              if (lastLeftWordID != -1) {
+                // Deduct bigram cost from this long token
+                // (we add new bigram cost back in, below):
+                //System.out.println("maxCost=" + maxCost + " - " + costs.get(getDict(posData.backType[bestIDX]).getRightId(posData.backID[bestIDX]),
+                //lastLeftWordID));
+                maxCost -= costs.get(lastLeftWordID,
+                                     getDict(posData.backType[bestIDX]).getRightId(posData.backID[bestIDX]));
+              }
+
+              // See if there's another path, within this
+              // threshold:
+
+              int secondBestIDX = -1;
+              int leastCost = Integer.MAX_VALUE;
+              for(int i=0;i<posData.count;i++) {
+                int cost = posData.costs[i];
+                if (lastLeftWordID != -1) {
+                  //System.out.println("  cost=" + cost + " + " + costs.get(getDict(posData.backType[i]).getRightId(posData.backID[i]),
+                  //lastLeftWordID));
+                  cost += costs.get(lastLeftWordID,
+                                    getDict(posData.backType[i]).getRightId(posData.backID[i]));
+                }
+                // Candidate must start strictly after the long
+                // token's start (so it decomposes it) and be
+                // within the penalty-derived cost budget:
+                if (i != bestIDX && cost <= maxCost &&
+                    posData.backPos[i] > backPos &&
+                    (secondBestIDX == -1 || cost < leastCost)) {
+                  secondBestIDX = i;
+                  leastCost = cost;
+                }
+              }
+
+              if (secondBestIDX != -1) {
+                if (altToken == null) {
+                  // nocommit: there's no guarantee the alt
+                  // token will "join up"?  the 2nd best parse
+                  // may not be "congruent"...
+
+                  // Save this token, to output when this
+                  // alternate path joins back
+                  altToken = new Token(posData.backID[bestIDX],
+                                       fragment,
+                                       backPos - lastBackTracePos,
+                                       length,
+                                       backType,
+                                       backPos,
+                                       getDict(backType));
+                  if (VERBOSE) {
+                    System.out.println("    create alt token: " + new String(buffer.get(backPos, length)));
+                  }
+                }
+
+                // Back-trace through this path instead:
+                bestIDX = secondBestIDX;
+
+                // nocommit hmmm:
+                assert posData.backPos[bestIDX] > backPos: posData.backPos[bestIDX] + " vs " + backPos;
+
+                // Loop again: the shorter alternate token may
+                // itself still exceed the length threshold.
+                backPos = posData.backPos[bestIDX];
+                length = pos - backPos;
+                backType = posData.backType[bestIDX];
+              } else {
+                break;
+              }
+            } else {
+              break;
+            }
+          } else {
+            break;
+          }
+        }
+      }
+
+      final int offset = backPos - lastBackTracePos;
+
+      // nocommit -- how come TestQuality doesn't change if
+      // i output the altToken!
+      // nocommit
+      if (false && altToken != null && altToken.getPosition() >= backPos) {
+        // We've backtraced to the position where the
+        // compound token starts; add it now:
+        pending.add(altToken);
+        altToken = null;
+      }
+
+      final Dictionary dict = getDict(backType);
+
+      // nocommit i assume there are no surrogate
+      // punctuation chars!?
+      if (!discardPunctuation || length == 0 || !isPunctuation(fragment[offset])) {
+        //System.out.println("backPos=" + backPos);
+        // nocommit for userDict matches we have to output
+        // the segmentation:
+        pending.add(new Token(posData.backID[bestIDX],
+                              fragment,
+                              offset,
+                              length,
+                              backType,
+                              backPos,
+                              dict));
+        if (VERBOSE) {
+          System.out.println("    add token=" + pending.get(pending.size()-1));
+        }
+      } else {
+        if (VERBOSE) {
+          System.out.println("    skip punctuation token=" + new String(fragment, offset, length));
+        }
+      }
+
+      // nocommit -- accuracy drops a bit if we do this.... weird
+      //lastLeftWordID = dict.getLeftId(posData.backID[bestIDX]);
+      // Step back to the start of the token we just emitted:
+      pos = backPos;
+      bestIDX = posData.backIndex[bestIDX];
+    }
+
+    lastBackTracePos = endPos;
+
+    if (VERBOSE) {
+      System.out.println("  freeBefore pos=" + endPos);
+    }
+    // Notify the circular buffers that we are done with
+    // these positions:
+    buffer.freeBefore(endPos);
+    positions.freeBefore(endPos);
+  }
+
+  // nocommit use Map, like Segmenter
+  /** Returns the dictionary that owns entries of the given type. */
+  private Dictionary getDict(Type type) {
+    switch (type) {
+      case KNOWN:
+        return dictionary;
+      case UNKNOWN:
+        return unkDictionary;
+      default:
+        assert type == Type.USER;
+        return userDictionary;
+    }
+  }
+
+  private static final boolean isPunctuation(char ch) {
+    // TODO: somehow this is slowish.. takes ~5% off
+    // chars/msec from Perf.java; maybe we
+    // can spend RAM...
+
+    // nocommit can we call this only when token is len
+    // 1... or it's unknown...?
+
+    // True for any separator, control/format char, or
+    // punctuation/symbol Unicode general category:
+    final int charType = Character.getType(ch);
+    return charType == Character.SPACE_SEPARATOR
+        || charType == Character.LINE_SEPARATOR
+        || charType == Character.PARAGRAPH_SEPARATOR
+        || charType == Character.CONTROL
+        || charType == Character.FORMAT
+        || charType == Character.DASH_PUNCTUATION
+        || charType == Character.START_PUNCTUATION
+        || charType == Character.END_PUNCTUATION
+        || charType == Character.CONNECTOR_PUNCTUATION
+        || charType == Character.OTHER_PUNCTUATION
+        || charType == Character.MATH_SYMBOL
+        || charType == Character.CURRENCY_SYMBOL
+        || charType == Character.MODIFIER_SYMBOL
+        || charType == Character.OTHER_SYMBOL
+        || charType == Character.INITIAL_QUOTE_PUNCTUATION
+        || charType == Character.FINAL_QUOTE_PUNCTUATION;
+  }
+}

Property changes on: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java	(revision 1241079)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java	(working copy)
@@ -42,6 +42,11 @@
     this.position = position;
     this.dictionary = dictionary;
   }
+
+  @Override
+  public String toString() {
+    // Debug-friendly rendering of the surface form plus key fields.
+    final StringBuilder sb = new StringBuilder();
+    sb.append("Token(\"");
+    sb.append(surfaceForm, offset, length);
+    sb.append("\" pos=").append(position);
+    sb.append(" type=").append(type);
+    sb.append(" wordId=").append(wordId);
+    sb.append(")");
+    return sb.toString();
+  }
   
   /**
    * @return surfaceForm
Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java	(revision 1241079)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java	(working copy)
@@ -117,7 +117,8 @@
       if (startIndexArr[i] == null || endIndexArr[i] == null){	// continue since no array which contains ViterbiNodes exists. Or no previous node exists.
         continue;
       }
-      
+
+      // For each arc leaving...
       for (ViterbiNode node : startIndexArr[i]) {
         if (node == null){	// If array doesn't contain ViterbiNode any more, continue to next index
           break;
@@ -126,6 +127,7 @@
         int backwardConnectionId = node.getLeftId();
         int wordCost = node.getWordCost();
         int leastPathCost = DEFAULT_COST;
+        // For each arc arriving...
         for (ViterbiNode leftNode : endIndexArr[i]) {
           if (leftNode == null){ // If array doesn't contain ViterbiNode any more, continue to next index
             break;
@@ -151,8 +153,10 @@
               
               if (allKanji) {	// Process only Kanji keywords
                 pathCost += (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY;
+                //System.out.println("    + kanji penalty=" + (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY + " cost=" + pathCost);
               } else if (length > SEARCH_MODE_OTHER_LENGTH) {
                 pathCost += (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY;								
+                //System.out.println("    + non-kanji penalty=" + (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY + " cost=" + pathCost);
               }
             }
           }
@@ -241,7 +245,7 @@
       int output = 0;
       for (int endIndex = 1; endIndex < suffixLength + 1; endIndex++) {
         int ch = text[suffixStart + endIndex - 1];
-        
+        //System.out.println("    match " + (char) ch);
         if (fst.findTargetArc(ch, arc, arc, endIndex == 1, fstReader) == null) {
           break; // continue to next position
         }
@@ -253,6 +257,7 @@
           dictionary.lookupWordIds(finalOutput, wordIdRef);
           for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
             final int wordId = wordIdRef.ints[wordIdRef.offset + ofs];
+            //System.out.println("output=" + finalOutput + " wid=" + wordId);
             ViterbiNode node = new ViterbiNode(wordId, text, suffixStart, endIndex, dictionary.getLeftId(wordId), dictionary.getRightId(wordId), dictionary.getWordCost(wordId), startIndex, Type.KNOWN);
             addToArrays(node, startIndex + 1, startIndex + 1 + endIndex, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
           }
@@ -326,6 +331,8 @@
   private void addToArrays(ViterbiNode node, int startIndex, int endIndex, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr ) {
     int startNodesCount = startSizeArr[startIndex];
     int endNodesCount = endSizeArr[endIndex];
+
+    //System.out.println("  + " + startIndex + " to " + endIndex);
     
     if (startNodesCount == 0) {
       startIndexArr[startIndex] = new ViterbiNode[10];
Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java	(revision 1241079)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java	(working copy)
@@ -29,8 +29,8 @@
 import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
 import org.apache.lucene.analysis.kuromoji.viterbi.GraphvizFormatter;
 import org.apache.lucene.analysis.kuromoji.viterbi.Viterbi;
+import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
 import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode;
-import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
 
 /**
  * Tokenizer main class.
@@ -68,7 +68,9 @@
   public Segmenter(UserDictionary userDictionary, Mode mode, boolean split) {
     final TokenInfoDictionary dict = TokenInfoDictionary.getInstance();
     final UnknownDictionary unknownDict = UnknownDictionary.getInstance();
+    // nocommit
     this.viterbi = new Viterbi(dict, unknownDict, ConnectionCosts.getInstance(), userDictionary, mode);
+    //this.viterbi = new Viterbi(dict, unknownDict, ConnectionCosts.getInstance(), userDictionary, Mode.NORMAL);
     this.split = split;
     
     dictionaryMap.put(Type.KNOWN, dict);
@@ -127,8 +129,9 @@
       } else {
         position = Math.min(indexOfMaru, indexOfTen);				
       }
-      
-      if(position >= 0) {
+
+      // nocommit
+      if(false && position >= 0) {
         splitPositions.add(position);
         currentPosition = position + 1;
       } else {
@@ -152,7 +155,6 @@
    */
   public List<Token> doTokenize(int offset, char[] sentence, int sentenceOffset, int sentenceLength, boolean discardPunctuation) {
     ArrayList<Token> result = new ArrayList<Token>();
-    
     ViterbiNode[][][] lattice;
     try {
       lattice = viterbi.build(sentence, sentenceOffset, sentenceLength);
@@ -170,7 +172,12 @@
       Token token = new Token(wordId, node.getSurfaceForm(), node.getOffset(), node.getLength(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType()));	// Pass different dictionary based on the type of node
       result.add(token);
     }
-    
+    /*
+    System.out.println("result:");
+    for(Token token : result) {
+      System.out.println("  " + token);
+    }
+    */
     return result;
   }
   
Index: modules/analysis/kuromoji/Perf.java
===================================================================
--- modules/analysis/kuromoji/Perf.java	(revision 0)
+++ modules/analysis/kuromoji/Perf.java	(working copy)
@@ -0,0 +1,44 @@
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer2;
+import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
+
+// javac -cp ../build/kuromoji/classes/java:../../../lucene/build/classes/java:../../analysis/build/common/lucene-analyzers-common-4.0-SNAPSHOT.jar Perf.java
+
+// java -cp .:../build/kuromoji/classes/java:../../../lucene/build/classes/java:../../analysis/build/common/lucene-analyzers-common-4.0-SNAPSHOT.jar Perf
+/** Throwaway micro-benchmark: repeatedly analyzes the same
+ *  Japanese text and prints per-round throughput. */
+public class Perf {
+
+  private final static Analyzer analyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer tokenizer = new KuromojiTokenizer2(reader, null, true, true);
+      //Tokenizer tokenizer = new KuromojiTokenizer(reader);
+      return new TokenStreamComponents(tokenizer, tokenizer);
+    }
+  };
+
+  final static String s0 = "本来は、貧困層の女性や子供に医療保護を提供するために創設された制度である、" +
+    "アメリカ低所得者医療援助制度が、今日では、その予算の約３分の１を老人に費やしている。";
+  final static String s = s0 + s0 + s0 + s0 + s0 + s0;
+
+  public static void main(String[] args) throws Exception {
+    // 10 outer rounds so hotspot can warm up; report each round:
+    for(int iter=0;iter<10;iter++) {
+      final long t0 = System.currentTimeMillis();
+      long count = 0;
+      final int ITERS = 3000;
+      for(int i=0;i<ITERS;i++) {
+        final TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
+        while(ts.incrementToken()) {
+          count++;
+        }
+      }
+      final long t1 = System.currentTimeMillis();
+      // Fixed: each inner iteration analyzes s (6 copies of s0),
+      // so throughput must use s.length(); using s0.length()
+      // understated chars/msec ~6x.  (If per-s0 normalization
+      // was intended, revert and comment why.)
+      System.out.println((t1-t0) + " msec; " + (s.length()*ITERS/((double) t1-t0)) + " chars/msec (" + count + " tokens)");
+    }
+  }
+}

Property changes on: modules/analysis/kuromoji/Perf.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
Index: lucene/src/java/org/apache/lucene/util/fst/FST.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/fst/FST.java	(revision 1241079)
+++ lucene/src/java/org/apache/lucene/util/fst/FST.java	(working copy)
@@ -840,6 +840,7 @@
   }
 
   public Arc<T> readFirstRealTargetArc(int node, Arc<T> arc, final BytesReader in) throws IOException {
+    assert in.bytes == bytes;
     final int address = getNodeAddress(node);
     in.pos = address;
     //System.out.println("  readFirstRealTargtArc address="
@@ -936,6 +937,7 @@
   /** Never returns null, but you should never call this if
    *  arc.isLast() is true. */
   public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException {
+    assert in.bytes == bytes;
 
     // TODO: can't assert this because we call from readFirstArc
     // assert !flag(arc.flags, BIT_LAST_ARC);
@@ -1019,6 +1021,7 @@
    *  This returns null if the arc was not found, else the incoming arc. */
   public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesReader in) throws IOException {
     assert cachedRootArcs != null;
+    assert in.bytes == bytes;
 
     if (labelToMatch == END_LABEL) {
       if (follow.isFinal()) {
@@ -1225,17 +1228,20 @@
 
   /** Expert */
   public static abstract class BytesReader extends DataInput {
-    int pos;
+    protected int pos;
+    protected final byte[] bytes;
+    protected BytesReader(byte[] bytes, int pos) {
+      this.bytes = bytes;
+      this.pos = pos;
+    }
     abstract void skip(int byteCount);
     abstract void skip(int base, int byteCount);
   }
 
   final static class ReverseBytesReader extends BytesReader {
-    final byte[] bytes;
 
     public ReverseBytesReader(byte[] bytes, int pos) {
-      this.bytes = bytes;
-      this.pos = pos;
+      super(bytes, pos);
     }
 
     @Override
@@ -1262,11 +1268,9 @@
   // TODO: can we use just ByteArrayDataInput...?  need to
   // add a .skipBytes to DataInput.. hmm and .setPosition
   final static class ForwardBytesReader extends BytesReader {
-    final byte[] bytes;
 
     public ForwardBytesReader(byte[] bytes, int pos) {
-      this.bytes = bytes;
-      this.pos = pos;
+      super(bytes, pos);
     }
 
     @Override
Index: lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
===================================================================
--- lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java	(revision 1241079)
+++ lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java	(working copy)
@@ -138,7 +138,8 @@
         assertTrue("endOffset must be >= startOffset", offsetAtt.endOffset() >= offsetAtt.startOffset());
         if (finalOffset != null) {
           assertTrue("startOffset must be <= finalOffset", offsetAtt.startOffset() <= finalOffset.intValue());
-          assertTrue("endOffset must be <= finalOffset", offsetAtt.endOffset() <= finalOffset.intValue());
+          assertTrue("endOffset must be <= finalOffset: got endOffset=" + offsetAtt.endOffset() + " vs finalOffset=" + finalOffset.intValue(),
+                     offsetAtt.endOffset() <= finalOffset.intValue());
         }
       }
       if (posIncrAtt != null) {
@@ -315,7 +316,7 @@
       if (VERBOSE) {
         System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
       }
-
+      
       int remainder = random.nextInt(10);
       Reader reader = new StringReader(text);
       TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
@@ -344,7 +345,7 @@
       // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
       if (!tokens.isEmpty()) {
         if (VERBOSE) {
-          System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
+          System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis useCharFilter=" + useCharFilter + " text.length()=" + text.length());
         }
         reader = new StringReader(text);
         ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
