Index: modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java	(revision 1294102)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java	(working copy)
@@ -112,6 +112,8 @@
 
   private int captureCount;
 
+  // TODO: we should set PositionLengthAttribute too...
+
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
   private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
===================================================================
--- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java	(revision 1294102)
+++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java	(working copy)
@@ -23,29 +23,17 @@
 import java.io.Reader;
 import java.io.IOException;
 
-import org.apache.lucene.analysis.kuromoji.SegmenterTest;
 import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
+import org.apache.lucene.analysis.kuromoji.TestKuromojiTokenizer;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
 import org.junit.Test;
 
 public class UserDictionaryTest extends LuceneTestCase {
 
-  private UserDictionary readDict() throws IOException {
-    InputStream is = SegmenterTest.class.getResourceAsStream("userdict.txt");
-    if (is == null)
-      throw new FileNotFoundException("Cannot find userdict.txt in test classpath!");
-    try {
-      Reader reader = new InputStreamReader(is, IOUtils.CHARSET_UTF_8);
-      return new UserDictionary(reader);
-    } finally {
-      is.close();
-    }
-  }
-  
   @Test
   public void testLookup() throws IOException {
-    UserDictionary dictionary = readDict();
+    UserDictionary dictionary = TestKuromojiTokenizer.readDict();
     String s = "関西国際空港に行った";
     int[][] dictionaryEntryResult = dictionary.lookup(s.toCharArray(), 0, s.length());
     // Length should be three 関西, 国際, 空港
@@ -69,7 +57,7 @@
   
   @Test
   public void testReadings() throws IOException {
-    UserDictionary dictionary = readDict();
+    UserDictionary dictionary = TestKuromojiTokenizer.readDict();
     int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
     assertEquals(3, result.length);
     int wordIdNihon = result[0][0]; // wordId of 日本 in 日本経済新聞
@@ -83,7 +71,7 @@
   
   @Test
   public void testPartOfSpeech() throws IOException {
-    UserDictionary dictionary = readDict();
+    UserDictionary dictionary = TestKuromojiTokenizer.readDict();
     int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
     assertEquals(3, result.length);
     int wordIdKeizai = result[1][0]; // wordId of 経済 in 日本経済新聞
@@ -92,7 +80,7 @@
   
   @Test
   public void testRead() throws IOException {
-    UserDictionary dictionary = readDict();
+    UserDictionary dictionary = TestKuromojiTokenizer.readDict();
     assertNotNull(dictionary);		
   }
 }
Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java
===================================================================
--- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java	(revision 1294102)
+++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java	(working copy)
@@ -18,8 +18,13 @@
  */
 
 import java.io.IOException;
+import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase {
   /** This test fails with NPE when the 
@@ -41,20 +46,103 @@
         new int[] { 1, 2, 2,  2 }
       );
   }
-  
+
   /**
    * Test that search mode is enabled and working by default
    */
   public void testDecomposition() throws IOException {
-    assertAnalyzesTo(new KuromojiAnalyzer(TEST_VERSION_CURRENT), "シニアソフトウェアエンジニア",
-        new String[] { "シニア", "ソフトウェア", "エンジニア" }
-    );
+
+    final Analyzer a = new KuromojiAnalyzer(TEST_VERSION_CURRENT, null, Mode.SEARCH,
+                                            KuromojiAnalyzer.getDefaultStopSet(),
+                                            KuromojiAnalyzer.getDefaultStopTags());
+
+    /*
+    //TokenStream ts = a.tokenStream("foo", new StringReader("妹の咲子です。俺と年子で、今受験生です。"));
+    TokenStream ts = a.tokenStream("foo", new StringReader("&#x250cdf66<!--\"<!--#<!--;?><!--#<!--#><!---->?>-->;"));
+    ts.reset();
+    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+    while(ts.incrementToken()) {
+      System.out.println("  " + termAtt.toString());
+    }
+    System.out.println("DONE PARSE\n\n");
+    */
+
+    // Senior software engineer:
+    assertAnalyzesToPositions(a, "シニアソフトウェアエンジニア",
+                              new String[] { "シニア",
+                                             "シニアソフトウェアエンジニア",
+                                             "ソフトウェア",
+                                             "エンジニア" },
+                              new int[] { 1, 0, 1, 1},
+                              new int[] { 1, 3, 1, 1}
+                              );
+
+    // Kansai International Airport:
+    assertAnalyzesToPositions(a, "関西国際空港",
+                              new String[] { "関西",
+                                             "関西国際空港", // zero pos inc
+                                             "国際",
+                                             "空港" },
+                              new int[] {1, 0, 1, 1},
+                              new int[] {1, 3, 1, 1}
+                              );
+
+    // Konica Minolta Holdings; not quite the right
+    // segmentation (see LUCENE-3726):
+    assertAnalyzesToPositions(a, "コニカミノルタホールディングス",
+                              new String[] { "コニカ",
+                                             "コニカミノルタホールディングス", // zero pos inc
+                                             "ミノルタ", 
+                                             "ホールディングス"},
+                              new int[] {1, 0, 1, 1},
+                              new int[] {1, 3, 1, 1}
+                              );
+
+    // Narita Airport
+    assertAnalyzesToPositions(a, "成田空港",
+                              new String[] { "成田",
+                                             "成田空港",
+                                             "空港" },
+                              new int[] {1, 0, 1},
+                              new int[] {1, 2, 1}
+                              );
+
+    // Kyoto University Baseball Club
+    assertAnalyzesToPositions(new KuromojiAnalyzer(TEST_VERSION_CURRENT), "京都大学硬式野球部",
+                     new String[] { "京都大",
+                                    "学",
+                                    "硬式",
+                                    "野球",
+                                    "部" },
+                              new int[] {1, 1, 1, 1, 1},
+                              new int[] {1, 1, 1, 1, 1});
+    // toDotFile(a, "成田空港", "/mnt/scratch/out.dot");
   }
+
   
   /**
    * blast random strings against the analyzer
    */
   public void testRandom() throws IOException {
-    checkRandomData(random, new KuromojiAnalyzer(TEST_VERSION_CURRENT), atLeast(10000));
+    final Analyzer a = new KuromojiAnalyzer(TEST_VERSION_CURRENT, null, Mode.SEARCH,
+                                            KuromojiAnalyzer.getDefaultStopSet(),
+                                            KuromojiAnalyzer.getDefaultStopTags());
+    checkRandomData(random, a, atLeast(10000));
   }
+
+  // Copied from TestKuromojiTokenizer, to make sure passing
+  // user dict to analyzer works:
+  public void testUserDict3() throws Exception {
+    // Test entry that breaks into multiple tokens:
+    final Analyzer a = new KuromojiAnalyzer(TEST_VERSION_CURRENT, TestKuromojiTokenizer.readDict(),
+                                            Mode.SEARCH,
+                                            KuromojiAnalyzer.getDefaultStopSet(),
+                                            KuromojiAnalyzer.getDefaultStopTags());
+    assertTokenStreamContents(a.tokenStream("foo", new StringReader("abcd")),
+                              new String[] { "a", "b", "cd"  },
+                              new int[] { 0, 1, 2 },
+                              new int[] { 1, 2, 4 },
+                              new Integer(4)
+    );
+  }
 }
Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java
===================================================================
--- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java	(revision 1294102)
+++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java	(working copy)
@@ -25,18 +25,17 @@
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util._TestUtil;
 
 public class TestExtendedMode extends BaseTokenStreamTestCase {
-  private final Segmenter segmenter = new Segmenter(Mode.EXTENDED);
   private final Analyzer analyzer = new Analyzer() {
     
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new KuromojiTokenizer(segmenter, reader);
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, Mode.EXTENDED);
       return new TokenStreamComponents(tokenizer, tokenizer);
     }
   };
Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java
===================================================================
--- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java	(revision 1294102)
+++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java	(working copy)
@@ -23,24 +23,24 @@
 import java.io.InputStreamReader;
 import java.io.LineNumberReader;
 import java.io.Reader;
+import java.util.Arrays;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
 import org.apache.lucene.util.IOUtils;
 
 public class TestSearchMode extends BaseTokenStreamTestCase {
   private final static String SEGMENTATION_FILENAME = "search-segmentation-tests.txt";
-  private final Segmenter segmenter = new Segmenter(Mode.SEARCH);
   private final Analyzer analyzer = new Analyzer() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new KuromojiTokenizer(segmenter, reader);
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, Mode.SEARCH);
       return new TokenStreamComponents(tokenizer, tokenizer);
     }
   };
-  
+
   /** Test search mode segmentation */
   public void testSearchSegmentation() throws IOException {
     InputStream is = TestSearchMode.class.getResourceAsStream(SEGMENTATION_FILENAME);
@@ -63,7 +63,18 @@
         String[] fields = line.split("\t", 2);
         String sourceText = fields[0];
         String[] expectedTokens = fields[1].split("\\s+");
-        assertAnalyzesTo(analyzer, sourceText, expectedTokens);
+        int[] expectedPosIncrs = new int[expectedTokens.length];
+        int[] expectedPosLengths = new int[expectedTokens.length];
+        for(int tokIDX=0;tokIDX<expectedTokens.length;tokIDX++) {
+          if (expectedTokens[tokIDX].endsWith("/0")) {
+            expectedTokens[tokIDX] = expectedTokens[tokIDX].replace("/0", "");
+            expectedPosLengths[tokIDX] = expectedTokens.length-1;
+          } else {
+            expectedPosIncrs[tokIDX] = 1;
+            expectedPosLengths[tokIDX] = 1;
+          }
+        }
+        assertAnalyzesTo(analyzer, sourceText, expectedTokens, expectedPosIncrs);
       }
     } finally {
       is.close();
Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/userdict.txt
===================================================================
--- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/userdict.txt	(revision 1294102)
+++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/userdict.txt	(working copy)
@@ -4,3 +4,7 @@
 
 # Custom reading for sumo wrestler
 朝青龍,朝青龍,アサショウリュウ,カスタム人名
+
+# Silly entry:
+abcd,a b cd,foo1 foo2 foo3,bar
+abcdefg,ab cd efg,foo1 foo2 foo4,bar
Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java
===================================================================
--- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java	(revision 1294102)
+++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java	(working copy)
@@ -28,7 +28,7 @@
   private Analyzer analyzer = new Analyzer() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new KuromojiTokenizer(reader);
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, KuromojiTokenizer.DEFAULT_MODE);
       return new TokenStreamComponents(tokenizer, new KuromojiBaseFormFilter(tokenizer));
     }
   };
Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java
===================================================================
--- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java	(revision 1294102)
+++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java	(working copy)
@@ -1,231 +0,0 @@
-package org.apache.lucene.analysis.kuromoji;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.InputStreamReader;
-import java.io.LineNumberReader;
-import java.util.List;
-
-import org.apache.lucene.util.LuceneTestCase;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-public class SegmenterTest extends LuceneTestCase {
-  
-  private static Segmenter segmenter;
-  
-  @BeforeClass
-  public static void setUpBeforeClass() throws Exception {
-    segmenter = new Segmenter();
-  }
-  
-  @AfterClass
-  public static void afterClass() throws Exception {
-    segmenter = null;
-  }
-  
-  @Test
-  public void testSegmentation() {
-    // Skip tests for Michelle Kwan -- UniDic segments Kwan as ク ワン
-    //		String input = "ミシェル・クワンが優勝しました。スペースステーションに行きます。うたがわしい。";
-    //		String[] surfaceForms = {
-    //				"ミシェル", "・", "クワン", "が", "優勝", "し", "まし", "た", "。",
-    //				"スペース", "ステーション", "に", "行き", "ます", "。",
-    //				"うたがわしい", "。"
-    //		};
-    String input = "スペースステーションに行きます。うたがわしい。";
-    String[] surfaceForms = {
-        "スペース", "ステーション", "に", "行き", "ます", "。",
-        "うたがわしい", "。"
-    };
-    List<Token> tokens = segmenter.tokenize(input);
-    assertTrue(tokens.size() == surfaceForms.length);
-    for (int i = 0; i < tokens.size(); i++) {
-      assertEquals(surfaceForms[i], tokens.get(i).getSurfaceFormString());
-    }
-  }
-  
-  @Test
-  public void testReadings() {
-    List<Token> tokens = segmenter.tokenize("寿司が食べたいです。");
-    assertEquals(6, tokens.size());
-    assertEquals("スシ", tokens.get(0).getReading());
-    assertEquals("ガ",    tokens.get(1).getReading());
-    assertEquals("タベ", tokens.get(2).getReading());
-    assertEquals("タイ",  tokens.get(3).getReading());
-    assertEquals("デス", tokens.get(4).getReading());
-    assertEquals("。", tokens.get(5).getReading());
-  }
-  
-  @Test
-  public void testReadings2() {
-    List<Token> tokens = segmenter.tokenize("多くの学生が試験に落ちた。");
-    assertEquals(9, tokens.size());
-    assertEquals("オオク", tokens.get(0).getReading());
-    assertEquals("ノ", tokens.get(1).getReading());
-    assertEquals("ガクセイ", tokens.get(2).getReading());
-    assertEquals("ガ", tokens.get(3).getReading());
-    assertEquals("シケン", tokens.get(4).getReading());
-    assertEquals("ニ", tokens.get(5).getReading());
-    assertEquals("オチ", tokens.get(6).getReading());
-    assertEquals("タ", tokens.get(7).getReading());
-    assertEquals("。", tokens.get(8).getReading());
-  }
-  
-  @Test
-  public void testPronunciations() {
-    List<Token> tokens = segmenter.tokenize("寿司が食べたいです。");
-    assertEquals(6, tokens.size());
-    assertEquals("スシ", tokens.get(0).getPronunciation());
-    assertEquals("ガ",    tokens.get(1).getPronunciation());
-    assertEquals("タベ", tokens.get(2).getPronunciation());
-    assertEquals("タイ",  tokens.get(3).getPronunciation());
-    assertEquals("デス", tokens.get(4).getPronunciation());
-    assertEquals("。", tokens.get(5).getPronunciation());
-  }
-  
-  @Test
-  public void testPronunciations2() {
-    List<Token> tokens = segmenter.tokenize("多くの学生が試験に落ちた。");
-    assertEquals(9, tokens.size());
-    // pronunciation differs from reading here
-    assertEquals("オーク", tokens.get(0).getPronunciation());
-    assertEquals("ノ", tokens.get(1).getPronunciation());
-    assertEquals("ガクセイ", tokens.get(2).getPronunciation());
-    assertEquals("ガ", tokens.get(3).getPronunciation());
-    assertEquals("シケン", tokens.get(4).getPronunciation());
-    assertEquals("ニ", tokens.get(5).getPronunciation());
-    assertEquals("オチ", tokens.get(6).getPronunciation());
-    assertEquals("タ", tokens.get(7).getPronunciation());
-    assertEquals("。", tokens.get(8).getPronunciation());
-  }
-  
-  @Test
-  public void testBasicForms() {
-    List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
-    assertEquals(9, tokens.size());
-    assertNull(tokens.get(0).getBaseForm());
-    assertNull(tokens.get(1).getBaseForm());
-    assertNull(tokens.get(2).getBaseForm());
-    assertNull(tokens.get(3).getBaseForm());
-    assertNull(tokens.get(4).getBaseForm());
-    assertNull(tokens.get(5).getBaseForm());
-    assertEquals(tokens.get(6).getBaseForm(), "ある");
-    assertNull(tokens.get(7).getBaseForm());
-    assertNull(tokens.get(8).getBaseForm());
-  }
-  
-  @Test
-  public void testInflectionTypes() {
-    List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
-    assertEquals(9, tokens.size());
-    assertNull(tokens.get(0).getInflectionType());
-    assertNull(tokens.get(1).getInflectionType());
-    assertNull(tokens.get(2).getInflectionType());
-    assertNull(tokens.get(3).getInflectionType());
-    assertNull(tokens.get(4).getInflectionType());
-    assertNull(tokens.get(5).getInflectionType());
-    assertEquals("五段・ラ行", tokens.get(6).getInflectionType());
-    assertEquals("特殊・マス", tokens.get(7).getInflectionType());
-    assertNull(tokens.get(8).getInflectionType());
-  }
-  
-  @Test
-  public void testInflectionForms() {
-    List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
-    assertEquals(9, tokens.size());
-    assertNull(tokens.get(0).getInflectionForm());
-    assertNull(tokens.get(1).getInflectionForm());
-    assertNull(tokens.get(2).getInflectionForm());
-    assertNull(tokens.get(3).getInflectionForm());
-    assertNull(tokens.get(4).getInflectionForm());
-    assertNull(tokens.get(5).getInflectionForm());
-    assertEquals("連用形", tokens.get(6).getInflectionForm());
-    assertEquals("基本形", tokens.get(7).getInflectionForm());
-    assertNull(tokens.get(8).getInflectionForm());
-  }
-  
-  @Test
-  public void testPartOfSpeech() {
-    List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
-    assertEquals(9, tokens.size());
-    assertEquals("名詞-代名詞-一般",  tokens.get(0).getPartOfSpeech());
-    assertEquals("助詞-係助詞",    tokens.get(1).getPartOfSpeech());
-    assertEquals("副詞-助詞類接続", tokens.get(2).getPartOfSpeech());
-    assertEquals("名詞-サ変接続",   tokens.get(3).getPartOfSpeech());
-    assertEquals("名詞-一般",      tokens.get(4).getPartOfSpeech());
-    assertEquals("助詞-格助詞-一般",  tokens.get(5).getPartOfSpeech());
-    assertEquals("動詞-自立",      tokens.get(6).getPartOfSpeech());
-    assertEquals("助動詞",       tokens.get(7).getPartOfSpeech());
-    assertEquals("記号-句点",      tokens.get(8).getPartOfSpeech());
-  }
-
-  // TODO: the next 2 tests are no longer using the first/last word ids, maybe lookup the words and fix?
-  // do we have a possibility to actually lookup the first and last word from dictionary?
-  public void testYabottai() {
-    List<Token> tokens = segmenter.tokenize("やぼったい");
-    assertEquals(1, tokens.size());
-    assertEquals("やぼったい", tokens.get(0).getSurfaceFormString());
-  }
-
-  public void testTsukitosha() {
-    List<Token> tokens = segmenter.tokenize("突き通しゃ");
-    assertEquals(1, tokens.size());
-    assertEquals("突き通しゃ", tokens.get(0).getSurfaceFormString());
-  }
-
-  public void testBocchan() throws Exception {
-    doTestBocchan(1);
-  }
-  
-  @Test @Nightly
-  public void testBocchanBig() throws Exception {
-    doTestBocchan(100);
-  }
-  
-  private void doTestBocchan(int numIterations) throws Exception {
-    LineNumberReader reader = new LineNumberReader(new InputStreamReader(
-        this.getClass().getResourceAsStream("bocchan.utf-8")));
-    
-    String line = reader.readLine();
-    reader.close();
-    
-    if (VERBOSE) {
-      System.out.println("Test for Bocchan without pre-splitting sentences");
-    }
-    long totalStart = System.currentTimeMillis();
-    for (int i = 0; i < numIterations; i++){
-      segmenter.tokenize(line);
-    }
-    if (VERBOSE) {
-      System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
-      System.out.println("Test for Bocchan with pre-splitting sentences");
-    }
-    String[] sentences = line.split("、|。");
-    totalStart = System.currentTimeMillis();
-    for (int i = 0; i < numIterations; i++) {
-      for (String sentence: sentences) {
-        segmenter.tokenize(sentence);       
-      }
-    }
-    if (VERBOSE) {
-      System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
-    }
-  }
-}
Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java
===================================================================
--- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java	(revision 1294102)
+++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java	(working copy)
@@ -18,6 +18,10 @@
  */
 
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.LineNumberReader;
+import java.io.PrintWriter;
 import java.io.Reader;
 import java.io.StringReader;
 
@@ -25,21 +29,74 @@
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
+import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
+import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
+import org.apache.lucene.analysis.kuromoji.tokenattributes.*;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util._TestUtil;
 
 public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
+
+  public static UserDictionary readDict() {
+    InputStream is = TestKuromojiTokenizer.class.getResourceAsStream("userdict.txt");
+    if (is == null) {
+      throw new RuntimeException("Cannot find userdict.txt in test classpath!");
+    }
+    try {
+      try {
+        Reader reader = new InputStreamReader(is, IOUtils.CHARSET_UTF_8);
+        return new UserDictionary(reader);
+      } finally {
+        is.close();
+      }
+    } catch (IOException ioe) {
+      throw new RuntimeException(ioe);
+    }
+  }
+
   private Analyzer analyzer = new Analyzer() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new KuromojiTokenizer(reader);
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.SEARCH);
       return new TokenStreamComponents(tokenizer, tokenizer);
     }
   };
-  
+
+  private Analyzer analyzerNormal = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.NORMAL);
+      return new TokenStreamComponents(tokenizer, tokenizer);
+    }
+  };
+
+  private Analyzer analyzerNoPunct = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), true, Mode.SEARCH);
+      return new TokenStreamComponents(tokenizer, tokenizer);
+    }
+  };
+
+  private Analyzer extendedModeAnalyzerNoPunct = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), true, Mode.EXTENDED);
+      return new TokenStreamComponents(tokenizer, tokenizer);
+    }
+  };
+
+  public void testNormalMode() throws Exception {
+    assertAnalyzesTo(analyzerNormal,
+                     "シニアソフトウェアエンジニア",
+                     new String[] {"シニアソフトウェアエンジニア"});
+  }
+
   public void testDecomposition1() throws Exception {
-    assertAnalyzesTo(analyzer, "本来は、貧困層の女性や子供に医療保護を提供するために創設された制度である、" +
+    assertAnalyzesTo(analyzerNoPunct, "本来は、貧困層の女性や子供に医療保護を提供するために創設された制度である、" +
                          "アメリカ低所得者医療援助制度が、今日では、その予算の約３分の１を老人に費やしている。",
      new String[] { "本来", "は",  "貧困", "層", "の", "女性", "や", "子供", "に", "医療", "保護", "を",      
                     "提供", "する", "ため", "に", "創設", "さ", "れ", "た", "制度", "で", "ある",  "アメリカ", 
@@ -55,7 +112,7 @@
   }
   
   public void testDecomposition2() throws Exception {
-    assertAnalyzesTo(analyzer, "麻薬の密売は根こそぎ絶やさなければならない",
+    assertAnalyzesTo(analyzerNoPunct, "麻薬の密売は根こそぎ絶やさなければならない",
       new String[] { "麻薬", "の", "密売", "は", "根こそぎ", "絶やさ", "なけれ", "ば", "なら", "ない" },
       new int[] { 0, 2, 3, 5, 6,  10, 13, 16, 17, 19 },
       new int[] { 2, 3, 5, 6, 10, 13, 16, 17, 19, 21 }
@@ -63,7 +120,7 @@
   }
   
   public void testDecomposition3() throws Exception {
-    assertAnalyzesTo(analyzer, "魔女狩大将マシュー・ホプキンス。",
+    assertAnalyzesTo(analyzerNoPunct, "魔女狩大将マシュー・ホプキンス。",
       new String[] { "魔女", "狩", "大将", "マシュー",  "ホプキンス" },
       new int[] { 0, 2, 3, 5, 10 },
       new int[] { 2, 3, 5, 9, 15 }
@@ -91,9 +148,32 @@
     ts.close();
   }
 
+  /*
+    // NOTE: intentionally fails!  Just trying to debug this
+    // one input...
+  public void testDecomposition6() throws Exception {
+    assertAnalyzesTo(analyzer, "奈良先端科学技術大学院大学",
+      new String[] { "これ", "は", "本", "で", "は", "ない" },
+      new int[] { 0, 2, 3, 4, 5, 6 },
+      new int[] { 2, 3, 4, 5, 6, 8 }
+                     );
+  }
+  */
+
   /** Tests that sentence offset is incorporated into the resulting offsets */
   public void testTwoSentences() throws Exception {
-    assertAnalyzesTo(analyzer, "魔女狩大将マシュー・ホプキンス。 魔女狩大将マシュー・ホプキンス。",
+    /*
+    //TokenStream ts = a.tokenStream("foo", new StringReader("妹の咲子です。俺と年子で、今受験生です。"));
+    TokenStream ts = analyzer.tokenStream("foo", new StringReader("&#x250cdf66<!--\"<!--#<!--;?><!--#<!--#><!---->?>-->;"));
+    ts.reset();
+    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+    while(ts.incrementToken()) {
+      System.out.println("  " + termAtt.toString());
+    }
+    System.out.println("DONE PARSE\n\n");
+    */
+
+    assertAnalyzesTo(analyzerNoPunct, "魔女狩大将マシュー・ホプキンス。 魔女狩大将マシュー・ホプキンス。",
       new String[] { "魔女", "狩", "大将", "マシュー", "ホプキンス",  "魔女", "狩", "大将", "マシュー",  "ホプキンス"  },
       new int[] { 0, 2, 3, 5, 10, 17, 19, 20, 22, 27 },
       new int[] { 2, 3, 5, 9, 15, 19, 20, 22, 26, 32 }
@@ -103,6 +183,7 @@
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, analyzerNoPunct, 10000*RANDOM_MULTIPLIER);
   }
   
   public void testLargeDocReliability() throws Exception {
@@ -125,6 +206,9 @@
   public void testSurrogates2() throws IOException {
     int numIterations = atLeast(10000);
     for (int i = 0; i < numIterations; i++) {
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter=" + i);
+      }
       String s = _TestUtil.randomUnicodeString(random, 100);
       TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
       CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
@@ -134,22 +218,363 @@
       }
     }
   }
+
+  public void testOnlyPunctuation() throws IOException {
+    TokenStream ts = analyzerNoPunct.tokenStream("foo", new StringReader("。、。。"));
+    ts.reset();
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
+
+  public void testOnlyPunctuationExtended() throws IOException {
+    TokenStream ts = extendedModeAnalyzerNoPunct.tokenStream("foo", new StringReader("......"));
+    ts.reset();
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
   
   // note: test is kinda silly since kuromoji emits punctuation tokens.
   // but, when/if we filter these out it will be useful.
   public void testEnd() throws Exception {
-    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("これは本ではない")),
+    assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", new StringReader("これは本ではない")),
         new String[] { "これ", "は", "本", "で", "は", "ない" },
         new int[] { 0, 2, 3, 4, 5, 6 },
         new int[] { 2, 3, 4, 5, 6, 8 },
         new Integer(8)
     );
-    
-    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("これは本ではない    ")),
+
+    assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", new StringReader("これは本ではない    ")),
         new String[] { "これ", "は", "本", "で", "は", "ない"  },
         new int[] { 0, 2, 3, 4, 5, 6, 8 },
         new int[] { 2, 3, 4, 5, 6, 8, 9 },
         new Integer(12)
     );
   }
+
+  public void testUserDict() throws Exception {
+    // Not a great test because w/o userdict.txt the
+    // segmentation is the same:
+    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("関西国際空港に行った")),
+                              new String[] { "関西", "国際", "空港", "に", "行っ", "た"  },
+                              new int[] { 0, 2, 4, 6, 7, 9 },
+                              new int[] { 2, 4, 6, 7, 9, 10 },
+                              new Integer(10)
+    );
+  }
+
+  public void testUserDict2() throws Exception {
+    // Better test: w/o userdict the segmentation is different:
+    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("朝青龍")),
+                              new String[] { "朝青龍"  },
+                              new int[] { 0 },
+                              new int[] { 3 },
+                              new Integer(3)
+    );
+  }
+
+  public void testUserDict3() throws Exception {
+    // Test entry that breaks into multiple tokens:
+    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("abcd")),
+                              new String[] { "a", "b", "cd"  },
+                              new int[] { 0, 1, 2 },
+                              new int[] { 1, 2, 4 },
+                              new Integer(4)
+    );
+  }
+
+  // HMM: fails (segments as a/b/cd/efghij)... because the
+  // two paths have exactly equal paths (1 KNOWN + 1
+  // UNKNOWN) and we don't seem to favor longer KNOWN /
+  // shorter UNKNOWN matches:
+
+  /*
+  public void testUserDict4() throws Exception {
+    // Test entry that has another entry as prefix
+    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("abcdefghij")),
+                              new String[] { "ab", "cd", "efg", "hij"  },
+                              new int[] { 0, 2, 4, 7 },
+                              new int[] { 2, 4, 7, 10 },
+                              new Integer(10)
+    );
+  }
+  */
+  
+  public void testSegmentation() throws Exception {
+    // Skip tests for Michelle Kwan -- UniDic segments Kwan as ク ワン
+    //		String input = "ミシェル・クワンが優勝しました。スペースステーションに行きます。うたがわしい。";
+    //		String[] surfaceForms = {
+    //				"ミシェル", "・", "クワン", "が", "優勝", "し", "まし", "た", "。",
+    //				"スペース", "ステーション", "に", "行き", "ます", "。",
+    //				"うたがわしい", "。"
+    //		};
+    String input = "スペースステーションに行きます。うたがわしい。";
+    String[] surfaceForms = {
+        "スペース", "ステーション", "に", "行き", "ます", "。",
+        "うたがわしい", "。"
+    };
+    assertAnalyzesTo(analyzer,
+                     input,
+                     surfaceForms);
+  }
+
+  public void testLatticeToDot() throws Exception {
+    final GraphvizFormatter gv2 = new GraphvizFormatter(ConnectionCosts.getInstance());
+    final Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        KuromojiTokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.SEARCH);
+        tokenizer.setGraphvizFormatter(gv2);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+    };
+
+    String input = "スペースステーションに行きます。うたがわしい。";
+    String[] surfaceForms = {
+        "スペース", "ステーション", "に", "行き", "ます", "。",
+        "うたがわしい", "。"
+    };
+    assertAnalyzesTo(analyzer,
+                     input,
+                     surfaceForms);
+    
+    assertTrue(gv2.finish().indexOf("22.0") != -1);
+  }
+
+  private void assertReadings(String input, String... readings) throws IOException {
+    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
+    ts.reset();
+    for(String reading : readings) {
+      assertTrue(ts.incrementToken());
+      assertEquals(reading, readingAtt.getReading());
+    }
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
+
+  private void assertPronunciations(String input, String... pronunciations) throws IOException {
+    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
+    ts.reset();
+    for(String pronunciation : pronunciations) {
+      assertTrue(ts.incrementToken());
+      assertEquals(pronunciation, readingAtt.getPronunciation());
+    }
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
+  
+  private void assertBaseForms(String input, String... baseForms) throws IOException {
+    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    BaseFormAttribute baseFormAtt = ts.addAttribute(BaseFormAttribute.class);
+    ts.reset();
+    for(String baseForm : baseForms) {
+      assertTrue(ts.incrementToken());
+      assertEquals(baseForm, baseFormAtt.getBaseForm());
+    }
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
+
+  private void assertInflectionTypes(String input, String... inflectionTypes) throws IOException {
+    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
+    ts.reset();
+    for(String inflectionType : inflectionTypes) {
+      assertTrue(ts.incrementToken());
+      assertEquals(inflectionType, inflectionAtt.getInflectionType());
+    }
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
+
+  private void assertInflectionForms(String input, String... inflectionForms) throws IOException {
+    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
+    ts.reset();
+    for(String inflectionForm : inflectionForms) {
+      assertTrue(ts.incrementToken());
+      assertEquals(inflectionForm, inflectionAtt.getInflectionForm());
+    }
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
+  
+  private void assertPartsOfSpeech(String input, String... partsOfSpeech) throws IOException {
+    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    PartOfSpeechAttribute partOfSpeechAtt = ts.addAttribute(PartOfSpeechAttribute.class);
+    ts.reset();
+    for(String partOfSpeech : partsOfSpeech) {
+      assertTrue(ts.incrementToken());
+      assertEquals(partOfSpeech, partOfSpeechAtt.getPartOfSpeech());
+    }
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
+  
+  public void testReadings() throws Exception {
+    assertReadings("寿司が食べたいです。",
+                   "スシ",
+                   "ガ",
+                   "タベ",
+                   "タイ",
+                   "デス",
+                   "。");
+  }
+  
+  public void testReadings2() throws Exception {
+    assertReadings("多くの学生が試験に落ちた。",
+                   "オオク",
+                   "ノ",
+                   "ガクセイ",
+                   "ガ",
+                   "シケン",
+                   "ニ",
+                   "オチ",
+                   "タ",
+                   "。");
+  }
+  
+  public void testPronunciations() throws Exception {
+    assertPronunciations("寿司が食べたいです。",
+                         "スシ",
+                         "ガ",
+                         "タベ",
+                         "タイ",
+                         "デス",
+                         "。");
+  }
+  
+  public void testPronunciations2() throws Exception {
+    // pronunciation differs from reading here
+    assertPronunciations("多くの学生が試験に落ちた。",
+                         "オーク",
+                         "ノ",
+                         "ガクセイ",
+                         "ガ",
+                         "シケン",
+                         "ニ",
+                         "オチ",
+                         "タ",
+                         "。");
+  }
+  
+  public void testBasicForms() throws Exception {
+    assertBaseForms("それはまだ実験段階にあります。",
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    "ある",
+                    null,
+                    null);
+  }
+  
+  public void testInflectionTypes() throws Exception {
+    assertInflectionTypes("それはまだ実験段階にあります。",
+                          null,
+                          null,
+                          null,
+                          null,
+                          null,
+                          null,
+                          "五段・ラ行",
+                          "特殊・マス",
+                          null);
+  }
+  
+  public void testInflectionForms() throws Exception {
+    assertInflectionForms("それはまだ実験段階にあります。",
+                          null,
+                          null,
+                          null,
+                          null,
+                          null,
+                          null,
+                          "連用形",
+                          "基本形",
+                          null);
+  }
+  
+  public void testPartOfSpeech() throws Exception {
+    assertPartsOfSpeech("それはまだ実験段階にあります。",
+                        "名詞-代名詞-一般",
+                        "助詞-係助詞",
+                        "副詞-助詞類接続",
+                        "名詞-サ変接続",
+                        "名詞-一般",
+                        "助詞-格助詞-一般",
+                        "動詞-自立",
+                        "助動詞",
+                        "記号-句点");
+  }
+
+  // TODO: the next 2 tests are no longer using the first/last word ids, maybe lookup the words and fix?
+  // do we have a possibility to actually lookup the first and last word from dictionary?
+  public void testYabottai() throws Exception {
+    assertAnalyzesTo(analyzer, "やぼったい",
+                     new String[] {"やぼったい"});
+  }
+
+  public void testTsukitosha() throws Exception {
+    assertAnalyzesTo(analyzer, "突き通しゃ",
+                     new String[] {"突き通しゃ"});
+  }
+
+  public void testBocchan() throws Exception {
+    doTestBocchan(1);
+  }
+
+  @Nightly
+  public void testBocchanBig() throws Exception {
+    doTestBocchan(100);
+  }
+  
+  private void doTestBocchan(int numIterations) throws Exception {
+    // Runs the analyzer numIterations times over the first line of bocchan.utf-8, whole and then pre-split on 、/。; timings print when VERBOSE.
+    // Decode explicitly as UTF-8 (per the resource name) rather than with the platform default charset:
+    LineNumberReader reader = new LineNumberReader(new InputStreamReader(
+        this.getClass().getResourceAsStream("bocchan.utf-8"), "UTF-8"));
+    String line = reader.readLine();
+    reader.close();
+    if (VERBOSE) {
+      System.out.println("Test for Bocchan without pre-splitting sentences");
+    }
+
+    /*
+    if (numIterations > 1) {
+      // warmup
+      for (int i = 0; i < numIterations; i++) {
+        final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(line));
+        ts.reset();
+        while(ts.incrementToken());
+      }
+    }
+    */
+
+    long totalStart = System.currentTimeMillis();
+    for (int i = 0; i < numIterations; i++) {
+      final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(line));
+      ts.reset();
+      while(ts.incrementToken());
+    }
+    String[] sentences = line.split("、|。");
+    if (VERBOSE) {
+      System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
+      System.out.println("Test for Bocchan with pre-splitting sentences (" + sentences.length + " sentences)");
+    }
+    totalStart = System.currentTimeMillis();
+    for (int i = 0; i < numIterations; i++) {
+      for (String sentence: sentences) {
+        final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(sentence));
+        ts.reset();
+        while(ts.incrementToken());
+      }
+    }
+    if (VERBOSE) {
+      System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
+    }
+  }
 }
Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt
===================================================================
--- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt	(working copy)
+++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt	(working copy)
@@ -25,43 +25,45 @@
 ##
 
 # Kansai Internationl Airport
-関西国際空港	関西 国際 空港
+関西国際空港	関西 関西国際空港/0 国際 空港
 # Narita Airport
-成田空港	成田 空港
+成田空港	成田 成田空港/0 空港
 # Haneda Airport
-羽田空港	羽田 空港
+羽田空港	羽田 羽田空港/0 空港
 # Nara Institute of Science and Technology
-奈良先端科学技術大学院大学	奈良 先端 科学 技術 大学院 大学
+奈良先端科学技術大学院大学	奈良 奈良先端科学技術大学院大学/0 先端 科学 技術 大学院 大学
 # Tokyo University
-東京大学	東京 大学
+東京大学	東京 東京大学/0 大学
 # Kyoto University
-京都大学	京都 大学
+京都大学	京都 京都大学/0 大学
+
+# NOTE: differs from non-compound mode:
 # Kyoto University Baseball Club
-京都大学硬式野球部	京都 大学 硬式 野球 部
+京都大学硬式野球部	京都大 学 硬式 野球 部
 
 ##
 ## Katakana titles
 ##
 
 # Senior Software Engineer
-シニアソフトウェアエンジニア	シニア ソフトウェア エンジニア
+シニアソフトウェアエンジニア	シニア シニアソフトウェアエンジニア/0 ソフトウェア エンジニア
 # Software Engineer
 ソフトウェアエンジニア	ソフトウェア エンジニア
 # Senior Project Manager
-シニアプロジェクトマネジャー	シニア プロジェクト マネジャー
+シニアプロジェクトマネジャー	シニア シニアプロジェクトマネジャー/0 プロジェクト マネジャー
 # Project Manager
 プロジェクトマネジャー	プロジェクト マネジャー
 # Senior Sales Engineer
-シニアセールスエンジニア	シニア セールス エンジニア
+シニアセールスエンジニア	シニア シニアセールスエンジニア/0 セールス エンジニア
 # System Architect
-システムアーキテクト	システム アーキテクト
+システムアーキテクト	システム システムアーキテクト/0 アーキテクト
 # Senior System Architect
-シニアシステムアーキテクト	シニア システム アーキテクト
+シニアシステムアーキテクト	シニア シニアシステムアーキテクト/0 システム アーキテクト
 # System Administrator
 システムアドミニストレータ	システム アドミニストレータ
-システムアドミニストレーター	システム アドミニストレーター
+システムアドミニストレーター	システム システムアドミニストレーター/0 アドミニストレーター
 # Senior System Administrator
-シニアシステムアドミニストレーター	シニア システム アドミニストレーター
+シニアシステムアドミニストレーター	シニア シニアシステムアドミニストレーター/0 システム アドミニストレーター
 
 ##
 ## Company names (several are fictitious)
@@ -70,25 +72,25 @@
 # SoftBank Mobile
 ソフトバンクモバイル	ソフトバンク モバイル
 # Alpine Materials
-アルパインマテリアルズ	アルパイン マテリアルズ
+アルパインマテリアルズ	アルパイン アルパインマテリアルズ/0 マテリアルズ
 # Sapporo Holdings
 サッポロホールディングス	サッポロ ホールディングス
 # Yamada Corporation
-ヤマダコーポレーション	ヤマダ コーポレーション
+ヤマダコーポレーション	ヤマダ ヤマダコーポレーション/0 コーポレーション
 # Canon Semiconductor equipement	NOTE: Semiconductor becomes semi + conductor
-キヤノンセミコンダクターエクィップメント	キヤノン セミ コンダクター エクィップメント
+キヤノンセミコンダクターエクィップメント	キヤノン キヤノンセミコンダクターエクィップメント/0 セミ コンダクター エクィップメント
 # Orental Chain
-オリエンタルチエン	オリエンタル チエン
+オリエンタルチエン	オリエンタル オリエンタルチエン/0 チエン
 # Ally Projects Japan	NOTE: Becomes one token as プロジェクツ is not in IPADIC
 アーリープロジェクツジャパン	アーリープロジェクツジャパン
 # Peter Pan Corporation
-ピーターパンコーポレーション	ピーター パン コーポレーション
+ピーターパンコーポレーション	ピーター ピーターパンコーポレーション/0 パン コーポレーション
 # AIM Create
 エイムクリエイツ	エイムクリエイツ
 # Mars Engineering
-マースエンジニアリング	マース エンジニアリング
+マースエンジニアリング	マース マースエンジニアリング/0 エンジニアリング
 # Fuji Protein Technology
-フジプロテインテクノロジー	フジ プロテイン テクノロジー
+フジプロテインテクノロジー	フジ フジプロテインテクノロジー/0 プロテイン テクノロジー
 
 ##
 ## Person names
@@ -100,7 +102,7 @@
 スティーブジョブズ	スティーブ ジョブズ
 # Harry Potter	NOTE: Becomes one token (short word)
 ハリーポッター	ハリーポッター
-# Bill Gates	NOTE: Becomes one token (short work)
+# Bill Gates	NOTE: Becomes one token (short word)
 ビルゲイツ	ビルゲイツ
 # Sean Connery	NOTE: Becomes one token (okay)
 ショーンコネリー	ショーンコネリー
@@ -133,8 +135,8 @@
 ##
 
 # JT Engineering	NOTE: Becomes J Tien ginia ring (substrings are in IPADIC)
-ジェイティエンジニアリング	ジェイ ティエン ジニア リング
+ジェイティエンジニアリング	ジェイ ジェイティエンジニアリング/0 ティエン ジニア リング
 # Anchovy pasta	NOTE: Become Anch yvipasta
-アンチョビパスタ	アンチ ョビパスタ
+アンチョビパスタ	アンチ アンチョビパスタ/0 ョビパスタ
 # Surprise gift	NOTE: Becomes one token (surprise not in IPADIC)
 サプライズギフト	サプライズギフト

Property changes on: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt
___________________________________________________________________
Deleted: svn:eol-style
## -1 +0,0 ##
-native
Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java	(revision 1294102)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java	(working copy)
@@ -27,6 +27,7 @@
 import java.util.Map;
 import java.util.TreeMap;
 
+import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
 import org.apache.lucene.analysis.kuromoji.util.CSVUtil;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.fst.Builder;
@@ -159,6 +160,10 @@
     return found ? toIndexArray(result) : EMPTY_RESULT;
   }
   
+  public TokenInfoFST getFST() {
+    return fst;
+  }
+
   private static final int[][] EMPTY_RESULT = new int[0][];
   
   /**
@@ -181,6 +186,10 @@
     }
     return result.toArray(new int[result.size()][]);
   }
+
+  public int[] lookupSegmentation(int phraseID) {
+    return segmentations[phraseID];
+  }
   
   @Override
   public int getLeftId(int wordId) {
Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java	(revision 1294102)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java	(working copy)
@@ -27,21 +27,25 @@
 import org.apache.lucene.analysis.cjk.CJKWidthFilter;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
+import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.util.Version;
 
 public class KuromojiAnalyzer extends StopwordAnalyzerBase {
-  private final Segmenter segmenter;
+  private final Mode mode;
   private final Set<String> stoptags;
+  private final UserDictionary userDict;
   
   public KuromojiAnalyzer(Version matchVersion) {
-    this(matchVersion, new Segmenter(), DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS);
+    this(matchVersion, null, KuromojiTokenizer.DEFAULT_MODE, DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS);
   }
   
-  public KuromojiAnalyzer(Version matchVersion, Segmenter segmenter, CharArraySet stopwords, Set<String> stoptags) {
+  public KuromojiAnalyzer(Version matchVersion, UserDictionary userDict, Mode mode, CharArraySet stopwords, Set<String> stoptags) {
     super(matchVersion, stopwords);
-    this.segmenter = segmenter;
+    this.userDict = userDict;
+    this.mode = mode;
     this.stoptags = stoptags;
   }
   
@@ -79,7 +83,7 @@
   
   @Override
   protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-    Tokenizer tokenizer = new KuromojiTokenizer(this.segmenter, reader);
+    Tokenizer tokenizer = new KuromojiTokenizer(reader, userDict, true, mode);
     TokenStream stream = new KuromojiBaseFormFilter(tokenizer);
     stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags);
     stream = new CJKWidthFilter(stream);
Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/GraphvizFormatter.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/GraphvizFormatter.java	(revision 0)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/GraphvizFormatter.java	(working copy)
@@ -0,0 +1,180 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Position;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Type;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.WrappedPositionArray;
+import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
+import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
+
+
+// TODO: would be nice to show 2nd best path in a different
+// color...
+
+public class GraphvizFormatter {
+  // Accumulates Graphviz DOT text ("digraph viterbi") for the lattice fragments passed to onBacktrace; finish() returns it.
+  private final static String BOS_LABEL = "BOS";
+  
+  private final static String EOS_LABEL = "EOS";
+  
+  private final static String FONT_NAME = "Helvetica";
+  
+  private final ConnectionCosts costs;
+  // fromNodeID -> toNodeID for each arc on the best path of the latest backtrace:
+  private final Map<String, String> bestPathMap;
+  // DOT text accumulated so far (header, then nodes and arcs per backtrace):
+  private final StringBuilder sb = new StringBuilder();
+  
+  public GraphvizFormatter(ConnectionCosts costs) {
+    this.costs = costs;
+    this.bestPathMap = new HashMap<String, String>();
+    sb.append(formatHeader());
+    sb.append("  init [style=invis]\n");
+    sb.append("  init -> 0.0 [label=\"" + BOS_LABEL + "\"]\n");
+  }
+  // Returns the DOT text so far plus the closing brace, without modifying state (repeated calls return the same text):
+  public String finish() {
+    // Don't append the trailer to sb: a second call would otherwise emit "}" twice.
+    return sb.toString() + formatTrailer();
+  }
+
+  // Backtraces another incremental fragment:
+  void onBacktrace(KuromojiTokenizer tok, WrappedPositionArray positions, int lastBackTracePos, Position endPosData, int fromIDX, char[] fragment, boolean isEnd) {
+    setBestPathMap(positions, lastBackTracePos, endPosData, fromIDX);
+    sb.append(formatNodes(tok, positions, lastBackTracePos, endPosData, fragment));
+    if (isEnd) {
+      sb.append("  fini [style=invis]\n");
+      sb.append("  ");
+      sb.append(getNodeID(endPosData.pos, fromIDX));
+      sb.append(" -> fini [label=\"" + EOS_LABEL + "\"]\n");
+    }
+  }
+
+  // Records which arcs make up the best path:
+  private void setBestPathMap(WrappedPositionArray positions, int startPos, Position endPosData, int fromIDX) {
+    bestPathMap.clear();
+
+    int pos = endPosData.pos;
+    int bestIDX = fromIDX;
+    while (pos > startPos) {
+      final Position posData = positions.get(pos);
+
+      final int backPos = posData.backPos[bestIDX];
+      final int backIDX = posData.backIndex[bestIDX];
+
+      final String toNodeID = getNodeID(pos, bestIDX);
+      final String fromNodeID = getNodeID(backPos, backIDX);
+      
+      assert !bestPathMap.containsKey(fromNodeID);
+      assert !bestPathMap.containsValue(toNodeID);
+      bestPathMap.put(fromNodeID, toNodeID);
+      pos = backPos;
+      bestIDX = backIDX;
+    }
+  }
+  
+  private String formatNodes(KuromojiTokenizer tok, WrappedPositionArray positions, int startPos, Position endPosData, char[] fragment) {
+    // Local buffer (shadows the field sb); the caller appends the returned text.
+    StringBuilder sb = new StringBuilder();
+    // Output nodes
+    for (int pos = startPos+1; pos <= endPosData.pos; pos++) {
+      final Position posData = positions.get(pos);
+      for(int idx=0;idx<posData.count;idx++) {
+        sb.append("  ");
+        sb.append(getNodeID(pos, idx));
+        sb.append(" [label=\"");
+        sb.append(pos);
+        sb.append(": ");
+        sb.append(posData.lastRightID[idx]);
+        sb.append("\"]\n");
+      }
+    }
+
+    // Output arcs
+    for (int pos = endPosData.pos; pos > startPos; pos--) {
+      final Position posData = positions.get(pos);
+      for(int idx=0;idx<posData.count;idx++) {
+        final Position backPosData = positions.get(posData.backPos[idx]);
+        final String toNodeID = getNodeID(pos, idx);
+        final String fromNodeID = getNodeID(posData.backPos[idx], posData.backIndex[idx]);
+
+        sb.append("  ");
+        sb.append(fromNodeID);
+        sb.append(" -> ");
+        sb.append(toNodeID);
+
+        final String attrs;
+        if (toNodeID.equals(bestPathMap.get(fromNodeID))) {
+          // This arc is on best path
+          attrs = " color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20";
+        } else {
+          attrs = "";
+        }
+
+        final Dictionary dict = tok.getDict(posData.backType[idx]);
+        final int wordCost = dict.getWordCost(posData.backID[idx]);
+        final int bgCost = costs.get(backPosData.lastRightID[posData.backIndex[idx]],
+                                     dict.getLeftId(posData.backID[idx]));
+
+        final String surfaceForm = new String(fragment,
+                                              posData.backPos[idx] - startPos,
+                                              pos - posData.backPos[idx]);
+        // Escape backslash and double quote so the surface form cannot end the quoted DOT label early:
+        sb.append(" [label=\"");
+        sb.append(surfaceForm.replace("\\", "\\\\").replace("\"", "\\\""));
+        sb.append(' ');
+        sb.append(wordCost);
+        if (bgCost >= 0) {
+          sb.append('+');
+        }
+        sb.append(bgCost);
+        sb.append("\"");
+        sb.append(attrs);
+        sb.append("]\n");
+      }
+    }
+    return sb.toString();
+  }
+  
+  private String formatHeader() {
+    StringBuilder sb = new StringBuilder();
+    sb.append("digraph viterbi {\n");
+    sb.append("  graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\"];\n");
+    //sb.append("  // A2 paper size\n");
+    //sb.append("  size = \"34.4,16.5\";\n");
+    //sb.append("  // try to fill paper\n");
+    //sb.append("  ratio = fill;\n");
+    sb.append("  edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
+    sb.append("  node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n");
+    
+    return sb.toString();
+  }
+  
+  private String formatTrailer() {
+    return "}";
+  }
+  
+  private String getNodeID(int pos, int idx) {
+    return pos + "." + idx;
+  }
+}

Property changes on: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/GraphvizFormatter.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java	(revision 1294102)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java	(working copy)
@@ -17,8 +17,8 @@
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Type;
 import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
-import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
 
 public class Token {
   private final Dictionary dictionary;
@@ -30,6 +30,7 @@
   private final int length;
   
   private final int position;
+  private int positionLength;
   
   private final Type type;
   
@@ -40,8 +41,14 @@
     this.length = length;
     this.type = type;
     this.position = position;
+    this.positionLength = positionLength;
     this.dictionary = dictionary;
   }
+
+  @Override
+  public String toString() {
+    return "Token(\"" + new String(surfaceForm, offset, length) + "\" pos=" + position + " type=" + type + " wordId=" + wordId + " leftID=" + dictionary.getLeftId(wordId) + ")";
+  }
   
   /**
    * @return surfaceForm
@@ -144,4 +151,21 @@
   public int getPosition() {
     return position;
   }
+
+  /**
+   * Set the position length (in tokens) of this token.  For normal
+   * tokens this is 1; for compound tokens it's > 1.
+   */
+  public void setPositionLength(int positionLength) {
+    this.positionLength = positionLength;
+  }
+  
+  /**
+   * Get the length (in tokens) of this token.  For normal
+   * tokens this is 1; for compound tokens it's > 1.
+   * @return position length of token
+   */
+  public int getPositionLength() {
+    return positionLength;
+  }
 }
Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java	(revision 1294102)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java	(working copy)
@@ -1,365 +0,0 @@
-package org.apache.lucene.analysis.kuromoji.viterbi;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.LinkedList;
-import java.util.List;
-
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
-import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition;
-import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
-import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
-import org.apache.lucene.analysis.kuromoji.dict.TokenInfoFST;
-import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
-import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
-import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
-import org.apache.lucene.util.IntsRef;
-import org.apache.lucene.util.fst.FST;
-
-public class Viterbi {
-  
-  private final TokenInfoFST fst;
-
-  private final TokenInfoDictionary dictionary;
-  
-  private final UnknownDictionary unkDictionary;
-  
-  private final ConnectionCosts costs;
-  
-  private final UserDictionary userDictionary;
-  
-  private final CharacterDefinition characterDefinition;
-  
-  private final boolean useUserDictionary;
-  
-  private final boolean searchMode;
-  
-  private final boolean extendedMode;
-  
-  private static final int DEFAULT_COST = 10000000;
-  
-  private static final int SEARCH_MODE_KANJI_LENGTH = 2;
-
-  private static final int SEARCH_MODE_OTHER_LENGTH = 7; // Must be >= SEARCH_MODE_KANJI_LENGTH
-
-  private static final int SEARCH_MODE_KANJI_PENALTY = 3000;
-
-  private static final int SEARCH_MODE_OTHER_PENALTY = 1700;
-  
-  private static final char[] BOS = "BOS".toCharArray();
-  
-  private static final char[] EOS = "EOS".toCharArray();
-  
-  /**
-   * Constructor
-   */
-  public Viterbi(TokenInfoDictionary dictionary,
-      UnknownDictionary unkDictionary,
-      ConnectionCosts costs,
-      UserDictionary userDictionary,
-      Mode mode) {
-    this.dictionary = dictionary;
-    this.fst = dictionary.getFST();
-    this.unkDictionary = unkDictionary;
-    this.costs = costs;
-    this.userDictionary = userDictionary;
-    if(userDictionary == null) {
-      this.useUserDictionary = false;
-    } else {
-      this.useUserDictionary = true;
-    }
-    
-    switch(mode){
-      case SEARCH:
-        searchMode = true;
-        extendedMode = false;
-        break;
-      case EXTENDED:
-        searchMode = true;
-        extendedMode = true;
-        break;
-      default:
-        searchMode = false;
-        extendedMode = false;
-        break;
-    }
-    
-    this.characterDefinition = unkDictionary.getCharacterDefinition();
-  }
-  
-  /**
-   * Find best path from input lattice.
-   * @param lattice the result of build method
-   * @return	List of ViterbiNode which consist best path 
-   */
-  public List<ViterbiNode> search(ViterbiNode[][][] lattice) {
-    ViterbiNode[][] startIndexArr = lattice[0];
-    ViterbiNode[][] endIndexArr = lattice[1];
-    
-    for (int i = 1; i < startIndexArr.length; i++){
-      
-      if (startIndexArr[i] == null || endIndexArr[i] == null){	// continue since no array which contains ViterbiNodes exists. Or no previous node exists.
-        continue;
-      }
-      
-      for (ViterbiNode node : startIndexArr[i]) {
-        if (node == null){	// If array doesn't contain ViterbiNode any more, continue to next index
-          break;
-        }
-        
-        int backwardConnectionId = node.getLeftId();
-        int wordCost = node.getWordCost();
-        int leastPathCost = DEFAULT_COST;
-        for (ViterbiNode leftNode : endIndexArr[i]) {
-          if (leftNode == null){ // If array doesn't contain ViterbiNode any more, continue to next index
-            break;
-          }
-          
-          int pathCost = leftNode.getPathCost() + costs.get(leftNode.getRightId(), backwardConnectionId) + wordCost;	// cost = [total cost from BOS to previous node] + [connection cost between previous node and current node] + [word cost]
-          
-          // "Search mode". Add extra costs if it is long node.
-          if (searchMode) {
-            //						System.out.print(""); // If this line exists, kuromoji runs faster for some reason when searchMode == false.
-            char[] surfaceForm = node.getSurfaceForm();
-            int offset = node.getOffset();
-            int length = node.getLength();
-            if (length > SEARCH_MODE_KANJI_LENGTH) {
-              boolean allKanji = true;
-              // check if node consists of only kanji
-              for (int pos = 0; pos < length; pos++) {
-                if (!characterDefinition.isKanji(surfaceForm[offset+pos])){
-                  allKanji = false;
-                  break;
-                }				
-              }
-              
-              if (allKanji) {	// Process only Kanji keywords
-                pathCost += (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY;
-              } else if (length > SEARCH_MODE_OTHER_LENGTH) {
-                pathCost += (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY;								
-              }
-            }
-          }
-          
-          if (pathCost < leastPathCost){	// If total cost is lower than before, set current previous node as best left node (previous means left).
-            leastPathCost = pathCost;
-            node.setPathCost(leastPathCost);
-            node.setLeftNode(leftNode);
-          }					
-        }
-      }
-    }
-    
-    // track best path
-    ViterbiNode node = endIndexArr[0][0];	// EOS
-    LinkedList<ViterbiNode> result = new LinkedList<ViterbiNode>();
-    result.add(node);
-    while (true) {
-      ViterbiNode leftNode = node.getLeftNode();
-      if (leftNode == null) {
-        break;
-      }
-      
-      // EXTENDED mode convert unknown word into unigram node
-      if (extendedMode && leftNode.getType() == Type.UNKNOWN) {
-        byte unigramWordId = CharacterDefinition.NGRAM;
-        int unigramLeftId = unkDictionary.getLeftId(unigramWordId); // isn't required
-        int unigramRightId = unkDictionary.getLeftId(unigramWordId); // isn't required
-        int unigramWordCost = unkDictionary.getWordCost(unigramWordId); // isn't required
-        char[] surfaceForm = leftNode.getSurfaceForm();
-        int offset = leftNode.getOffset();
-        int length = leftNode.getLength();
-        for (int i = length - 1; i >= 0; i--) {
-          int charLen = 1;
-          if (i > 0 && Character.isLowSurrogate(surfaceForm[offset+i])) {
-            i--;
-            charLen = 2;
-          }
-          ViterbiNode uniGramNode = new ViterbiNode(unigramWordId, surfaceForm, offset + i, charLen, unigramLeftId, unigramRightId, unigramWordCost, leftNode.getStartIndex() + i, Type.UNKNOWN);
-          result.addFirst(uniGramNode);
-        }
-      } else {
-        result.addFirst(leftNode);		
-      }
-      node = leftNode;
-    }
-    
-    return result;
-  }
-
-  /**
-   * Build lattice from input text
-   * @param text
-   */
-  public ViterbiNode[][][] build(char text[], int offset, int length) throws IOException {
-    ViterbiNode[][] startIndexArr = new ViterbiNode[length + 2][];  // text length + BOS and EOS
-    ViterbiNode[][] endIndexArr = new ViterbiNode[length + 2][];  // text length + BOS and EOS
-    int[] startSizeArr = new int[length + 2]; // array to keep ViterbiNode count in startIndexArr
-    int[] endSizeArr = new int[length + 2];   // array to keep ViterbiNode count in endIndexArr
-    FST.Arc<Long> arc = new FST.Arc<Long>();
-    ViterbiNode bosNode = new ViterbiNode(-1, BOS, 0, BOS.length, 0, 0, 0, -1, Type.KNOWN);
-    addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
-    
-    final FST.BytesReader fstReader = fst.getBytesReader(0);
-
-    // Process user dictionary;
-    if (useUserDictionary) {
-      processUserDictionary(text, offset, length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
-    }
-    
-    int unknownWordEndIndex = -1;	// index of the last character of unknown word
-    
-    final IntsRef wordIdRef = new IntsRef();
-    
-    for (int startIndex = 0; startIndex < length; startIndex++) {
-      // If no token ends where current token starts, skip this index
-      if (endSizeArr[startIndex + 1] == 0) {
-        continue;
-      }
-      
-      int suffixStart = offset + startIndex;
-      int suffixLength = length - startIndex;
-      
-      boolean found = false;
-      arc = fst.getFirstArc(arc);
-      int output = 0;
-      for (int endIndex = 1; endIndex < suffixLength + 1; endIndex++) {
-        int ch = text[suffixStart + endIndex - 1];
-        
-        if (fst.findTargetArc(ch, arc, arc, endIndex == 1, fstReader) == null) {
-          break; // continue to next position
-        }
-        output += arc.output.intValue();
-
-        if (arc.isFinal()) {
-          final int finalOutput = output + arc.nextFinalOutput.intValue();
-          found = true; // Don't produce unknown word starting from this index
-          dictionary.lookupWordIds(finalOutput, wordIdRef);
-          for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
-            final int wordId = wordIdRef.ints[wordIdRef.offset + ofs];
-            ViterbiNode node = new ViterbiNode(wordId, text, suffixStart, endIndex, dictionary.getLeftId(wordId), dictionary.getRightId(wordId), dictionary.getWordCost(wordId), startIndex, Type.KNOWN);
-            addToArrays(node, startIndex + 1, startIndex + 1 + endIndex, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
-          }
-        }
-      }
-      
-      // In the case of normal mode, it doesn't process unknown word greedily.
-      if(!searchMode && unknownWordEndIndex > startIndex){
-        continue;
-      }
-      
-      // Process Unknown Word: hmm what is this isInvoke logic (same no matter what)
-      int unknownWordLength = 0;
-      char firstCharacter = text[suffixStart];
-      boolean isInvoke = characterDefinition.isInvoke(firstCharacter);
-      if (isInvoke){	// Process "invoke"
-        unknownWordLength = unkDictionary.lookup(text, suffixStart, suffixLength);
-      } else if (found == false){	// Process not "invoke"
-        unknownWordLength = unkDictionary.lookup(text, suffixStart, suffixLength);				
-      }
-      
-      if (unknownWordLength > 0) {      // found unknown word
-        final int characterId = characterDefinition.getCharacterClass(firstCharacter);
-        unkDictionary.lookupWordIds(characterId, wordIdRef); // characters in input text are supposed to be the same
-        for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
-          final int wordId = wordIdRef.ints[wordIdRef.offset + ofs];
-          ViterbiNode node = new ViterbiNode(wordId, text, suffixStart, unknownWordLength, unkDictionary.getLeftId(wordId), unkDictionary.getRightId(wordId), unkDictionary.getWordCost(wordId), startIndex, Type.UNKNOWN);
-          addToArrays(node, startIndex + 1, startIndex + 1 + unknownWordLength, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
-        }
-        unknownWordEndIndex = startIndex + unknownWordLength;
-      }
-    }
-    
-    ViterbiNode eosNode = new ViterbiNode(-1, EOS, 0, EOS.length, 0, 0, 0, length + 1, Type.KNOWN);
-    addToArrays(eosNode, length + 1, 0, startIndexArr, endIndexArr, startSizeArr, endSizeArr); //Add EOS node to endIndexArr at index 0
-    
-    ViterbiNode[][][] result = new ViterbiNode[][][]{startIndexArr, endIndexArr};
-    
-    return result;
-  }
-  
-  /**
-   * Find token(s) in input text and set found token(s) in arrays as normal tokens
-   * @param text	
-   * @param startIndexArr
-   * @param endIndexArr
-   * @param startSizeArr
-   * @param endSizeArr
-   */
-  private void processUserDictionary(char text[], int offset, int len, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr) throws IOException {
-    int[][] result = userDictionary.lookup(text, offset, len);
-    for(int[] segmentation : result) {
-      int wordId = segmentation[0];
-      int index = segmentation[1];
-      int length = segmentation[2];
-      ViterbiNode node = new ViterbiNode(wordId, text, offset + index, length, userDictionary.getLeftId(wordId), userDictionary.getRightId(wordId), userDictionary.getWordCost(wordId), index, Type.USER);
-      addToArrays(node, index + 1, index + 1 + length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
-    }
-  }
-  
-  /**
-   * Add node to arrays and increment count in size array
-   * @param node
-   * @param startIndex
-   * @param endIndex
-   * @param startIndexArr
-   * @param endIndexArr
-   * @param startSizeArr
-   * @param endSizeArr
-   */
-  private void addToArrays(ViterbiNode node, int startIndex, int endIndex, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr ) {
-    int startNodesCount = startSizeArr[startIndex];
-    int endNodesCount = endSizeArr[endIndex];
-    
-    if (startNodesCount == 0) {
-      startIndexArr[startIndex] = new ViterbiNode[10];
-    }
-    
-    if (endNodesCount == 0) {
-      endIndexArr[endIndex] = new ViterbiNode[10];
-    }
-    
-    if (startIndexArr[startIndex].length <= startNodesCount){
-      startIndexArr[startIndex] = extendArray(startIndexArr[startIndex]);
-    }
-    
-    if (endIndexArr[endIndex].length <= endNodesCount){
-      endIndexArr[endIndex] = extendArray(endIndexArr[endIndex]);
-    }
-    
-    startIndexArr[startIndex][startNodesCount] = node;
-    endIndexArr[endIndex][endNodesCount] = node;
-    
-    startSizeArr[startIndex] = startNodesCount + 1;
-    endSizeArr[endIndex] = endNodesCount + 1;
-  }
-  
-  
-  /**
-   * Return twice as big array which contains value of input array
-   * @param array
-   * @return
-   */
-  private ViterbiNode[] extendArray(ViterbiNode[] array) {
-    //extend array
-    ViterbiNode[] newArray = new ViterbiNode[array.length * 2];
-    System.arraycopy(array, 0, newArray, 0, array.length);
-    return newArray;
-  }
-}
Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java	(revision 1294102)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java	(working copy)
@@ -1,147 +0,0 @@
-package org.apache.lucene.analysis.kuromoji.viterbi;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-public final class ViterbiNode {
-  public enum Type {
-    KNOWN,
-    UNKNOWN,
-    USER
-  }
-  
-  private final int wordId;
-  
-  private final char[] surfaceForm;
-  private final int offset;
-  private final int length;
-  
-  private final int leftId;
-  
-  private final int rightId;
-  
-  /** word cost for this node */
-  private final int wordCost;
-  
-  /** minimum path cost found thus far */
-  private int pathCost;
-  
-  private ViterbiNode leftNode;
-  
-  private final Type type;
-  
-  private final int startIndex;
-  
-  public ViterbiNode(int wordId, char[] surfaceForm, int offset, int length, int leftId, int rightId, int wordCost, int startIndex, Type type) {
-    this.wordId = wordId;
-    this.surfaceForm = surfaceForm;
-    this.offset = offset;
-    this.length = length;
-    this.leftId = leftId;
-    this.rightId = rightId;
-    this.wordCost = wordCost;
-    this.startIndex = startIndex;
-    this.type = type;
-  }
-  
-  
-  /**
-   * @return the wordId
-   */
-  public int getWordId() {
-    return wordId;
-  }
-  
-  /**
-   * @return the surfaceForm
-   */
-  public char[] getSurfaceForm() {
-    return surfaceForm;
-  }
-  
-  /**
-   * @return start offset into surfaceForm
-   */
-  public int getOffset() {
-    return offset;
-  }
-  
-  /**
-   * @return length of surfaceForm
-   */
-  public int getLength() {
-    return length;
-  }
-  
-  /**
-   * @return the surfaceForm as a String
-   */
-  public String getSurfaceFormString() {
-    return new String(surfaceForm, offset, length);
-  }
-  
-  /**
-   * @return the leftId
-   */
-  public int getLeftId() {
-    return leftId;
-  }
-  
-  /**
-   * @return the rightId
-   */
-  public int getRightId() {
-    return rightId;
-  }
-  
-  /**
-   * @return the cost
-   */
-  public int getWordCost() {
-    return wordCost;
-  }
-  
-  /**
-   * @return the cost
-   */
-  public int getPathCost() {
-    return pathCost;
-  }
-  
-  /**
-   * param cost minimum path cost found this far
-   */
-  public void setPathCost(int pathCost) {
-    this.pathCost = pathCost;
-  }
-  
-  public void setLeftNode(ViterbiNode node) {
-    leftNode = node;
-  }
-  
-  public ViterbiNode getLeftNode() {
-    return leftNode;
-  }
-  
-  public int getStartIndex() {
-    return startIndex;
-  }
-  
-  public Type getType() {
-    return type;
-  }
-}
Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java	(revision 1294102)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java	(working copy)
@@ -1,226 +0,0 @@
-package org.apache.lucene.analysis.kuromoji.viterbi;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
-import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
-
-public class GraphvizFormatter {
-  
-  private final static String BOS_LABEL = "BOS";
-  
-  private final static String EOS_LABEL = "EOS";
-  
-  private final static String FONT_NAME = "Helvetica";
-  
-  private ConnectionCosts costs;
-  
-  private Map<String, ViterbiNode> nodeMap;
-  
-  private Map<String, String> bestPathMap;
-  
-  private boolean foundBOS;
-  
-  public GraphvizFormatter(ConnectionCosts costs) {
-    this.costs = costs;
-    this.nodeMap = new HashMap<String, ViterbiNode>();
-    this.bestPathMap = new HashMap<String, String>();
-  }
-  
-  public String format(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray) {
-    initBestPathMap(null);
-    
-    StringBuilder sb = new StringBuilder();
-    sb.append(formatHeader());
-    sb.append(formatNodes(startsArray, endsArray));
-    sb.append(formatTrailer());
-    return sb.toString();
-  }
-  
-  public String format(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray, List<ViterbiNode> bestPath) {
-    
-    //		List<ViterbiNode> bestPathWithBOSAndEOS = new ArrayList<ViterbiNode>(bastPath);
-    initBestPathMap(bestPath);
-    
-    StringBuilder sb = new StringBuilder();
-    sb.append(formatHeader());
-    sb.append(formatNodes(startsArray, endsArray));
-    sb.append(formatTrailer());
-    return sb.toString();
-    
-  }
-  
-  private void initBestPathMap(List<ViterbiNode> bestPath) {
-    this.bestPathMap.clear();
-    
-    if (bestPath == null){
-      return;
-    }
-    for (int i = 0; i < bestPath.size() - 1; i++) {
-      ViterbiNode from = bestPath.get(i);
-      ViterbiNode to = bestPath.get(i + 1);
-      
-      String fromId = getNodeId(from);
-      String toId = getNodeId(to);
-      
-      assert this.bestPathMap.containsKey(fromId) == false;
-      assert this.bestPathMap.containsValue(toId) == false;
-      this.bestPathMap.put(fromId, toId);
-    }
-  }
-  
-  private String formatNodes(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray) {
-    this.nodeMap.clear();
-    this.foundBOS = false;
-    
-    StringBuilder sb = new StringBuilder();
-    for (int i = 1; i < endsArray.length; i++) {
-      if(endsArray[i] == null || startsArray[i] == null) {
-        continue;
-      }
-      for (int j = 0; j < endsArray[i].length; j++) {
-        ViterbiNode from = endsArray[i][j];
-        if(from == null){
-          continue;
-        }
-        sb.append(formatNodeIfNew(from));
-        for (int k = 0; k < startsArray[i].length; k++) {
-          ViterbiNode to = startsArray[i][k];
-          if(to == null){
-            break;
-          }
-          sb.append(formatNodeIfNew(to));
-          sb.append(formatEdge(from, to));
-        }
-      }
-    }
-    return sb.toString();
-  }
-  
-  private String formatNodeIfNew(ViterbiNode node) {
-    String nodeId = getNodeId(node);
-    if (! this.nodeMap.containsKey(nodeId)) {
-      this.nodeMap.put(nodeId, node);
-      return formatNode(node);
-    } else {
-      return "";
-    }
-  }	
-  
-  private String formatHeader() {
-    StringBuilder sb = new StringBuilder();
-    sb.append("digraph viterbi {\n");
-    sb.append("graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\" ];\n");
-    sb.append("# A2 paper size\n");
-    sb.append("size = \"34.4,16.5\";\n");
-    sb.append("# try to fill paper\n");
-    sb.append("ratio = fill;\n");
-    sb.append("edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
-    sb.append("node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n");
-    
-    return sb.toString();
-  }
-  
-  private String formatTrailer() {
-    return "}";
-  }
-  
-  
-  private String formatEdge(ViterbiNode from, ViterbiNode to) {
-    if (this.bestPathMap.containsKey(getNodeId(from)) &&
-        this.bestPathMap.get(getNodeId(from)).equals(getNodeId(to))) {
-      return formatEdge(from, to, "color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20 ");
-      
-    } else {
-      return formatEdge(from, to, "");
-    }
-  }
-  
-  
-  private String formatEdge(ViterbiNode from, ViterbiNode to, String attributes) {
-    StringBuilder sb = new StringBuilder();
-    sb.append(getNodeId(from));
-    sb.append(" -> ");
-    sb.append(getNodeId(to));
-    sb.append(" [ ");
-    sb.append("label=\"");
-    sb.append(getCost(from, to));
-    sb.append("\"");
-    sb.append(" ");
-    sb.append(attributes);
-    sb.append(" ");
-    sb.append(" ]");
-    sb.append("\n");
-    return sb.toString();
-  }
-  
-  private String formatNode(ViterbiNode node) {
-    StringBuilder sb = new StringBuilder();
-    sb.append("\"");
-    sb.append(getNodeId(node));
-    sb.append("\"");
-    sb.append(" [ ");
-    sb.append("label=");
-    sb.append(formatNodeLabel(node));
-    sb.append(" ]");
-    return sb.toString();
-  }
-  
-  private String formatNodeLabel(ViterbiNode node) {
-    StringBuilder sb = new StringBuilder();
-    sb.append("<<table border=\"0\" cellborder=\"0\">");
-    sb.append("<tr><td>");
-    sb.append(getNodeLabel(node));
-    sb.append("</td></tr>");
-    sb.append("<tr><td>");
-    sb.append("<font color=\"blue\">");
-    sb.append(node.getWordCost());
-    sb.append("</font>");
-    sb.append("</td></tr>");
-    //		sb.append("<tr><td>");
-    //		sb.append(this.dictionary.get(node.getWordId()).getPosInfo());
-    //		sb.append("</td></tr>");
-    sb.append("</table>>");
-    return sb.toString();
-  }
-  
-  private String getNodeId(ViterbiNode node) {
-    return String.valueOf(node.hashCode());
-  }
-  
-  private String getNodeLabel(ViterbiNode node) {
-    if (node.getType() == Type.KNOWN && node.getWordId() == 0) {
-      if (this.foundBOS) {
-        return EOS_LABEL;
-      } else {
-        this.foundBOS = true;
-        return BOS_LABEL;
-      }
-    } else {
-      return node.getSurfaceFormString();
-    }
-  }
-  
-  private int getCost(ViterbiNode from, ViterbiNode to) {
-    return this.costs.get(from.getLeftId(), to.getRightId());
-  }
-}
Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java	(revision 1294102)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java	(working copy)
@@ -1,214 +0,0 @@
-package org.apache.lucene.analysis.kuromoji;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.EnumMap;
-import java.util.List;
-
-import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
-import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
-import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
-import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
-import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
-import org.apache.lucene.analysis.kuromoji.viterbi.GraphvizFormatter;
-import org.apache.lucene.analysis.kuromoji.viterbi.Viterbi;
-import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode;
-import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
-
-/**
- * Tokenizer main class.
- * Thread safe.
- */
-public class Segmenter {
-  public static enum Mode {
-    NORMAL, SEARCH, EXTENDED
-  }
-  
-  public static final Mode DEFAULT_MODE = Mode.SEARCH;
-  
-  private final Viterbi viterbi;
-  
-  private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<Type, Dictionary>(Type.class);
-  
-  private final boolean split;
-  
-  public Segmenter() {
-    this(null, DEFAULT_MODE, false);
-  }
-
-  public Segmenter(Mode mode) {
-    this(null, mode, false);
-  }
-
-  public Segmenter(UserDictionary userDictionary) {
-    this(userDictionary, DEFAULT_MODE, false);
-  }
-
-  public Segmenter(UserDictionary userDictionary, Mode mode) {
-    this(userDictionary, mode, false);
-  }
-
-  public Segmenter(UserDictionary userDictionary, Mode mode, boolean split) {
-    final TokenInfoDictionary dict = TokenInfoDictionary.getInstance();
-    final UnknownDictionary unknownDict = UnknownDictionary.getInstance();
-    this.viterbi = new Viterbi(dict, unknownDict, ConnectionCosts.getInstance(), userDictionary, mode);
-    this.split = split;
-    
-    dictionaryMap.put(Type.KNOWN, dict);
-    dictionaryMap.put(Type.UNKNOWN, unknownDict);
-    dictionaryMap.put(Type.USER, userDictionary);
-  }
-  
-  /**
-   * Tokenize input text
-   * @param text
-   * @return list of Token
-   */
-  public List<Token> tokenize(String text) {
-    
-    if (!split) {
-      return doTokenize(0, text);			
-    }
-    
-    List<Integer> splitPositions = getSplitPositions(text);
-    
-    if(splitPositions.size() == 0) {
-      return doTokenize(0, text);
-    }
-    
-    ArrayList<Token> result = new ArrayList<Token>();
-    int offset = 0;
-    for(int position : splitPositions) {
-      result.addAll(doTokenize(offset, text.substring(offset, position + 1)));
-      offset = position + 1;
-    }
-    
-    if(offset < text.length()) {
-      result.addAll(doTokenize(offset, text.substring(offset)));
-    }
-    
-    return result;
-  }
-  
-  /**
-   * Split input text at 句読点, which is 。 and 、
-   * @param text
-   * @return list of split position
-   */
-  private List<Integer> getSplitPositions(String text) {
-    ArrayList<Integer> splitPositions = new ArrayList<Integer>();
-    
-    int position = 0;
-    int currentPosition = 0;
-    
-    while(true) {
-      int indexOfMaru = text.indexOf("。", currentPosition);
-      int indexOfTen = text.indexOf("、", currentPosition);
-      
-      if(indexOfMaru < 0 || indexOfTen < 0) {
-        position = Math.max(indexOfMaru, indexOfTen);;
-      } else {
-        position = Math.min(indexOfMaru, indexOfTen);				
-      }
-      
-      if(position >= 0) {
-        splitPositions.add(position);
-        currentPosition = position + 1;
-      } else {
-        break;
-      }
-    }
-    
-    return splitPositions;
-  }
-  
-  private List<Token> doTokenize(int offset, String sentence) {
-    char text[] = sentence.toCharArray();
-    return doTokenize(offset, text, 0, text.length, false);
-  }
-  
-  /**
-   * Tokenize input sentence.
-   * @param offset offset of sentence in original input text
-   * @param sentence sentence to tokenize
-   * @return list of Token
-   */
-  public List<Token> doTokenize(int offset, char[] sentence, int sentenceOffset, int sentenceLength, boolean discardPunctuation) {
-    ArrayList<Token> result = new ArrayList<Token>();
-    
-    ViterbiNode[][][] lattice;
-    try {
-      lattice = viterbi.build(sentence, sentenceOffset, sentenceLength);
-    } catch (IOException impossible) {
-      throw new RuntimeException(impossible);
-    }
-    List<ViterbiNode> bestPath = viterbi.search(lattice);
-    for (ViterbiNode node : bestPath) {
-      int wordId = node.getWordId();
-      if (node.getType() == Type.KNOWN && wordId == -1){ // Do not include BOS/EOS 
-        continue;
-      } else if (discardPunctuation && node.getLength() > 0 && isPunctuation(node.getSurfaceForm()[node.getOffset()])) {
-        continue; // Do not emit punctuation
-      }
-      Token token = new Token(wordId, node.getSurfaceForm(), node.getOffset(), node.getLength(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType()));	// Pass different dictionary based on the type of node
-      result.add(token);
-    }
-    
-    return result;
-  }
-  
-  /** returns a Graphviz String */
-  public String debugTokenize(String text) {
-    ViterbiNode[][][] lattice;
-    try {
-      lattice = this.viterbi.build(text.toCharArray(), 0, text.length());
-    } catch (IOException impossible) {
-      throw new RuntimeException(impossible);
-    }
-    List<ViterbiNode> bestPath = this.viterbi.search(lattice);
-    
-    return new GraphvizFormatter(ConnectionCosts.getInstance())
-      .format(lattice[0], lattice[1], bestPath);
-  }
-  
-  static final boolean isPunctuation(char ch) {
-    switch(Character.getType(ch)) {
-      case Character.SPACE_SEPARATOR:
-      case Character.LINE_SEPARATOR:
-      case Character.PARAGRAPH_SEPARATOR:
-      case Character.CONTROL:
-      case Character.FORMAT:
-      case Character.DASH_PUNCTUATION:
-      case Character.START_PUNCTUATION:
-      case Character.END_PUNCTUATION:
-      case Character.CONNECTOR_PUNCTUATION:
-      case Character.OTHER_PUNCTUATION:
-      case Character.MATH_SYMBOL:
-      case Character.CURRENCY_SYMBOL:
-      case Character.MODIFIER_SYMBOL:
-      case Character.OTHER_SYMBOL:
-      case Character.INITIAL_QUOTE_PUNCTUATION:
-      case Character.FINAL_QUOTE_PUNCTUATION:
-        return true;
-      default:
-        return false;
-    }
-  }
-}
Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java	(revision 1294102)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java	(working copy)
@@ -17,67 +17,1120 @@
  * limitations under the License.
  */
 
+import java.io.IOException;
 import java.io.Reader;
-import java.text.BreakIterator;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.EnumMap;
 import java.util.List;
-import java.util.Locale;
 
-import org.apache.lucene.analysis.kuromoji.tokenattributes.BaseFormAttribute;
-import org.apache.lucene.analysis.kuromoji.tokenattributes.InflectionAttribute;
-import org.apache.lucene.analysis.kuromoji.tokenattributes.PartOfSpeechAttribute;
-import org.apache.lucene.analysis.kuromoji.tokenattributes.ReadingAttribute;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition;
+import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
+import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
+import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
+import org.apache.lucene.analysis.kuromoji.dict.TokenInfoFST;
+import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
+import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
+import org.apache.lucene.analysis.kuromoji.tokenattributes.*;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.RollingCharBuffer;
+import org.apache.lucene.util.fst.FST;
 
-public final class KuromojiTokenizer extends SegmentingTokenizerBase {
-  private static final BreakIterator proto = BreakIterator.getSentenceInstance(Locale.JAPAN);
+// TODO: somehow factor out a reusable viterbi search here,
+// so other decompounders/tokenizers can reuse...
+
+/* Uses a rolling Viterbi search to find the least cost
+ * segmentation (path) of the incoming characters.  For
+ * tokens that appear to be compound (> length 2 for all
+ * Kanji, or > length 7 for non-Kanji), we see if there is a
+ * 2nd best segmentation of that token after applying
+ * penalties to the long tokens.  If so, and the Mode is
+ * SEARCH, we output the alternate
+ * segmentation as well. */
+public final class KuromojiTokenizer extends Tokenizer {
+
+  public static enum Mode {
+    NORMAL, SEARCH, EXTENDED
+  }
+
+  public static final Mode DEFAULT_MODE = Mode.SEARCH;
+
+  enum Type {
+    KNOWN,
+    UNKNOWN,
+    USER
+  }
+
+  private static final boolean VERBOSE = false;
+
+  private static final int SEARCH_MODE_KANJI_LENGTH = 2;
+
+  private static final int SEARCH_MODE_OTHER_LENGTH = 7; // Must be >= SEARCH_MODE_KANJI_LENGTH
+
+  private static final int SEARCH_MODE_KANJI_PENALTY = 3000;
+
+  private static final int SEARCH_MODE_OTHER_PENALTY = 1700;
+
+  private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<Type, Dictionary>(Type.class);
+
+  private final TokenInfoFST fst;
+  private final TokenInfoDictionary dictionary;
+  private final UnknownDictionary unkDictionary;
+  private final ConnectionCosts costs;
+  private final UserDictionary userDictionary;
+  private final CharacterDefinition characterDefinition;
+
+  private final FST.Arc<Long> arc = new FST.Arc<Long>();
+  private final FST.BytesReader fstReader;
+  private final IntsRef wordIdRef = new IntsRef();
+
+  private final FST.BytesReader userFSTReader;
+  private final TokenInfoFST userFST;
+
+  private final RollingCharBuffer buffer = new RollingCharBuffer();
+
+  private final WrappedPositionArray positions = new WrappedPositionArray();
+
+  private final boolean discardPunctuation;
+  private final boolean searchMode;
+  private final boolean extendedMode;
+  private final boolean outputCompounds;
+
+  // End position (exclusive) of the most recent unknown word:
+  private int unknownWordEndIndex = -1;
+
+  // True once we've hit the EOF from the input reader:
+  private boolean end;
+
+  // Last absolute position we backtraced from:
+  private int lastBackTracePos;
+
+  // Position of last token we returned; we use this to
+  // figure out whether to set posIncr to 0 or 1:
+  private int lastTokenPos;
+
+  // Next absolute position to process:
+  private int pos;
+
+  // Already parsed, but not yet passed to caller, tokens:
+  private final List<Token> pending = new ArrayList<Token>();
+
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
   private final BaseFormAttribute basicFormAtt = addAttribute(BaseFormAttribute.class);
   private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
   private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
   private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class);
-  private final Segmenter segmenter;
-  
-  private List<Token> tokens; 
-  private int tokenIndex = 0;
-  private int sentenceStart = 0;
-  
-  public KuromojiTokenizer(Reader input) {
-    this(new Segmenter(), input);
+
+  public KuromojiTokenizer(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
+    super(input);
+    dictionary = TokenInfoDictionary.getInstance();
+    fst = dictionary.getFST();
+    unkDictionary = UnknownDictionary.getInstance();
+    characterDefinition = unkDictionary.getCharacterDefinition();
+    this.userDictionary = userDictionary;
+    costs = ConnectionCosts.getInstance();
+    fstReader = fst.getBytesReader(0);
+    if (userDictionary != null) {
+      userFST = userDictionary.getFST();
+      userFSTReader = userFST.getBytesReader(0);
+    } else {
+      userFST = null;
+      userFSTReader = null;
+    }
+    this.discardPunctuation = discardPunctuation;
+    switch(mode){
+      case SEARCH:
+        searchMode = true;
+        extendedMode = false;
+        outputCompounds = true;
+        break;
+      case EXTENDED:
+        searchMode = true;
+        extendedMode = true;
+        outputCompounds = false;
+        break;
+      default:
+        searchMode = false;
+        extendedMode = false;
+        outputCompounds = false;
+        break;
+    }
+    buffer.reset(input);
+
+    resetState();
+
+    dictionaryMap.put(Type.KNOWN, dictionary);
+    dictionaryMap.put(Type.UNKNOWN, unkDictionary);
+    dictionaryMap.put(Type.USER, userDictionary);
   }
-  
-  public KuromojiTokenizer(Segmenter segmenter, Reader input) {
-    super(input, (BreakIterator) proto.clone());
-    this.segmenter = segmenter;
+
+  private GraphvizFormatter dotOut;
+
+  /** Expert: set this to produce graphviz (dot) output of
+   *  the Viterbi lattice */
+  public void setGraphvizFormatter(GraphvizFormatter dotOut) {
+    this.dotOut = dotOut;
   }
-  
+
   @Override
-  protected void setNextSentence(int sentenceStart, int sentenceEnd) {
-    this.sentenceStart = sentenceStart;
-    // TODO: maybe don't pass 0 here, so kuromoji tracks offsets for us?
-    tokens = segmenter.doTokenize(0, buffer, sentenceStart, sentenceEnd-sentenceStart, true);
-    tokenIndex = 0;
+  public void reset(Reader input) throws IOException {
+    super.reset(input);
+    buffer.reset(input);
   }
 
   @Override
-  protected boolean incrementWord() {
-    if (tokenIndex == tokens.size()) {
-      return false;
+  public void reset() throws IOException {
+    super.reset();
+    resetState();
+  }
+
+  private void resetState() {
+    positions.reset();
+    unknownWordEndIndex = -1;
+    pos = 0;
+    end = false;
+    lastBackTracePos = 0;
+    lastTokenPos = -1;
+    pending.clear();
+
+    // Add BOS:
+    positions.get(0).add(0, 0, -1, -1, -1, Type.KNOWN);
+  }
+
+  @Override
+  public void end() {
+    // Set final offset
+    offsetAtt.setOffset(correctOffset(pos), correctOffset(pos));
+  }
+
+  // Returns the added cost that a 2nd best segmentation is
+  // allowed to have.  Ie, if we see path with cost X,
+  // ending in a compound word, and this method returns
+  // threshold > 0, then we will also find the 2nd best
+  // segmentation and if its path score is within this
+  // threshold of X, we'll include it in the output:
+  private int computeSecondBestThreshold(int pos, int length) throws IOException {
+    // TODO: maybe we do something else here, instead of just
+    // using the penalty...?  EG we can be more aggressive on
+    // when to also test for 2nd best path
+    return computePenalty(pos, length);
+  }
+
+  private int computePenalty(int pos, int length) throws IOException {
+    if (length > SEARCH_MODE_KANJI_LENGTH) {
+      boolean allKanji = true;
+      // check if node consists of only kanji
+      final int endPos = pos + length;
+      for (int pos2 = pos; pos2 < endPos; pos2++) {
+        if (!characterDefinition.isKanji((char) buffer.get(pos2))) {
+          allKanji = false;
+          break;
+        }				
+      }
+      if (allKanji) {	// Process only Kanji keywords
+        return (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY;
+      } else if (length > SEARCH_MODE_OTHER_LENGTH) {
+        return (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY;								
+      }
     }
-    Token token = tokens.get(tokenIndex);
+    return 0;
+  }
+
+  // Holds all back pointers arriving to this position:
+  final static class Position {
+
+    int pos;
+
+    int count;
+
+    // maybe single int array * 5?
+    int[] costs = new int[8];
+    int[] lastRightID = new int[8];
+    int[] backPos = new int[8];
+    int[] backIndex = new int[8];
+    int[] backID = new int[8];
+    Type[] backType = new Type[8];
+
+    // Only used when finding 2nd best segmentation under a
+    // too-long token:
+    int forwardCount;
+    int[] forwardPos = new int[8];
+    int[] forwardID = new int[8];
+    int[] forwardIndex = new int[8];
+    Type[] forwardType = new Type[8];
+
+    public void grow() {
+      costs = ArrayUtil.grow(costs, 1+count);
+      lastRightID = ArrayUtil.grow(lastRightID, 1+count);
+      backPos = ArrayUtil.grow(backPos, 1+count);
+      backIndex = ArrayUtil.grow(backIndex, 1+count);
+      backID = ArrayUtil.grow(backID, 1+count);
+
+      // NOTE: sneaky: grow separately because
+      // ArrayUtil.grow will otherwise pick a different
+      // length than the int[]s we just grew:
+      final Type[] newBackType = new Type[backID.length];
+      System.arraycopy(backType, 0, newBackType, 0, backType.length);
+      backType = newBackType;
+    }
+
+    public void growForward() {
+      forwardPos = ArrayUtil.grow(forwardPos, 1+forwardCount);
+      forwardID = ArrayUtil.grow(forwardID, 1+forwardCount);
+      forwardIndex = ArrayUtil.grow(forwardIndex, 1+forwardCount);
+
+      // NOTE: sneaky: grow separately because
+      // ArrayUtil.grow will otherwise pick a different
+      // length than the int[]s we just grew:
+      final Type[] newForwardType = new Type[forwardPos.length];
+      System.arraycopy(forwardType, 0, newForwardType, 0, forwardType.length);
+      forwardType = newForwardType;
+    }
+
+    public void add(int cost, int lastRightID, int backPos, int backIndex, int backID, Type backType) {
+      // NOTE: this isn't quite a true Viterbi search,
+      // because we should check if lastRightID is
+      // already present here, and only update if the new
+      // cost is less than the current cost, instead of
+      // simply appending.  However, that will likely hurt
+      // performance (usually we add a lastRightID only once),
+      // and it means we actually create the full graph
+      // intersection instead of a "normal" Viterbi lattice:
+      if (count == costs.length) {
+        grow();
+      }
+      this.costs[count] = cost;
+      this.lastRightID[count] = lastRightID;
+      this.backPos[count] = backPos;
+      this.backIndex[count] = backIndex;
+      this.backID[count] = backID;
+      this.backType[count] = backType;
+      count++;
+    }
+
+    public void addForward(int forwardPos, int forwardIndex, int forwardID, Type forwardType) {
+      if (forwardCount == this.forwardID.length) {
+        growForward();
+      }
+      this.forwardPos[forwardCount] = forwardPos;
+      this.forwardIndex[forwardCount] = forwardIndex;
+      this.forwardID[forwardCount] = forwardID;
+      this.forwardType[forwardCount] = forwardType;
+      forwardCount++;
+    }
+
+    public void reset() {
+      count = 0;
+      // forwardCount naturally resets after it runs:
+      assert forwardCount == 0: "pos=" + pos + " forwardCount=" + forwardCount;
+    }
+  }
+
+  private void add(Dictionary dict, Position fromPosData, int endPos, int wordID, Type type, boolean addPenalty) throws IOException {
+    final int wordCost = dict.getWordCost(wordID);
+    final int leftID = dict.getLeftId(wordID);
+    int leastCost = Integer.MAX_VALUE;
+    int leastIDX = -1;
+    assert fromPosData.count > 0;
+    for(int idx=0;idx<fromPosData.count;idx++) {
+      // Cost is path cost so far, plus word cost (added at
+      // end of loop), plus bigram cost:
+      final int cost = fromPosData.costs[idx] + costs.get(fromPosData.lastRightID[idx], leftID);
+      if (VERBOSE) {
+        System.out.println("      fromIDX=" + idx + ": cost=" + cost + " (prevCost=" + fromPosData.costs[idx] + " wordCost=" + wordCost + " bgCost=" + costs.get(fromPosData.lastRightID[idx], leftID) + " leftID=" + leftID);
+      }
+      if (cost < leastCost) {
+        leastCost = cost;
+        leastIDX = idx;
+        //System.out.println("        **");
+      }
+    }
+
+    leastCost += wordCost;
+
+    if (VERBOSE) {
+      System.out.println("      + cost=" + leastCost + " wordID=" + wordID + " leftID=" + leftID + " leastIDX=" + leastIDX + " toPos.idx=" + positions.get(endPos).count);
+    }
+
+    if ((addPenalty || (!outputCompounds && searchMode)) && type != Type.USER) {
+      final int penalty = computePenalty(fromPosData.pos, endPos - fromPosData.pos);
+      if (VERBOSE) {
+        if (penalty > 0) {
+          System.out.println("        + penalty=" + penalty + " cost=" + (leastCost+penalty));
+        }
+      }
+      leastCost += penalty;
+    }
+
+    //positions.get(endPos).add(leastCost, dict.getRightId(wordID), fromPosData.pos, leastIDX, wordID, type);
+    assert leftID == dict.getRightId(wordID);
+    positions.get(endPos).add(leastCost, leftID, fromPosData.pos, leastIDX, wordID, type);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+
+    // parse() is able to return w/o producing any new
+    // tokens, when the tokens it had produced were entirely
+    // punctuation.  So we loop here until we get a real
+    // token or we end:
+    while (pending.size() == 0) {
+      if (end) {
+        return false;
+      }
+
+      // Push Viterbi forward some more:
+      parse();
+    }
+
+    final Token token = pending.remove(pending.size()-1);
+
     int position = token.getPosition();
     int length = token.getLength();
     clearAttributes();
-    termAtt.copyBuffer(buffer, sentenceStart + position, length);
-    int startOffset = offset + sentenceStart + position;
-    offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset+length));
+    assert length > 0;
+    //System.out.println("off=" + token.getOffset() + " len=" + length + " vs " + token.getSurfaceForm().length);
+    termAtt.copyBuffer(token.getSurfaceForm(), token.getOffset(), length);
+    offsetAtt.setOffset(correctOffset(position), correctOffset(position+length));
     basicFormAtt.setToken(token);
     posAtt.setToken(token);
     readingAtt.setToken(token);
     inflectionAtt.setToken(token);
-    tokenIndex++;
+    if (token.getPosition() == lastTokenPos) {
+      posIncAtt.setPositionIncrement(0);
+      posLengthAtt.setPositionLength(token.getPositionLength());
+    } else {
+      assert token.getPosition() > lastTokenPos;
+      posIncAtt.setPositionIncrement(1);
+      posLengthAtt.setPositionLength(1);
+    }
+    if (VERBOSE) {
+      System.out.println(Thread.currentThread().getName() + ":    incToken: return token=" + token);
+    }
+    lastTokenPos = token.getPosition();
     return true;
   }
+
+  // TODO: make generic'd version of this "circular array"?
+  // It's a bit tricky because we do things to the Position
+  // (eg, set .pos = N on reuse)...
+  static final class WrappedPositionArray {
+    private Position[] positions = new Position[8];
+
+    public WrappedPositionArray() {
+      for(int i=0;i<positions.length;i++) {
+        positions[i] = new Position();
+      }
+    }
+
+    // Next array index to write to in positions:
+    private int nextWrite;
+
+    // Next position to write:
+    private int nextPos;
+    
+    // How many valid Position instances are held in the
+    // positions array:
+    private int count;
+
+    public void reset() {
+      nextWrite--;
+      while(count > 0) {
+        if (nextWrite == -1) {
+          nextWrite = positions.length - 1;
+        }
+        positions[nextWrite--].reset();
+        count--;
+      }
+      nextWrite = 0;
+      nextPos = 0;
+      count = 0;
+    }
+
+    /** Get Position instance for this absolute position;
+     *  this is allowed to be arbitrarily far "in the
+     *  future" but cannot be before the last freeBefore. */
+    public Position get(int pos) {
+      while(pos >= nextPos) {
+        //System.out.println("count=" + count + " vs len=" + positions.length);
+        if (count == positions.length) {
+          Position[] newPositions = new Position[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+          //System.out.println("grow positions " + newPositions.length);
+          System.arraycopy(positions, nextWrite, newPositions, 0, positions.length-nextWrite);
+          System.arraycopy(positions, 0, newPositions, positions.length-nextWrite, nextWrite);
+          for(int i=positions.length;i<newPositions.length;i++) {
+            newPositions[i] = new Position();
+          }
+          nextWrite = positions.length;
+          positions = newPositions;
+        }
+        if (nextWrite == positions.length) {
+          nextWrite = 0;
+        }
+        // Should have already been reset:
+        assert positions[nextWrite].count == 0;
+        positions[nextWrite++].pos = nextPos++;
+        count++;
+      }
+      assert inBounds(pos);
+      final int index = getIndex(pos);
+      assert positions[index].pos == pos;
+      return positions[index];
+    }
+
+    public int getNextPos() {
+      return nextPos;
+    }
+
+    // For assert:
+    private boolean inBounds(int pos) {
+      return pos < nextPos && pos >= nextPos - count;
+    }
+
+    private int getIndex(int pos) {
+      int index = nextWrite - (nextPos - pos);
+      if (index < 0) {
+        index += positions.length;
+      }
+      return index;
+    }
+
+    public void freeBefore(int pos) {
+      final int toFree = count - (nextPos - pos);
+      assert toFree >= 0;
+      assert toFree <= count;
+      int index = nextWrite - count;
+      if (index < 0) {
+        index += positions.length;
+      }
+      for(int i=0;i<toFree;i++) {
+        if (index == positions.length) {
+          index = 0;
+        }
+        //System.out.println("  fb idx=" + index);
+        positions[index].reset();
+        index++;
+      }
+      count -= toFree;
+    }
+  }
+
+  /* Incrementally parse some more characters.  This runs
+   * the viterbi search forwards "enough" so that we
+   * generate some more tokens.  How much forward depends on
+   * the chars coming in, since some chars could cause
+   * longer-lasting ambiguity in the parsing.  Once the
+   * ambiguity is resolved, then we back trace, produce
+   * the pending tokens, and return. */
+  private void parse() throws IOException {
+    if (VERBOSE) {
+      System.out.println("\nPARSE");
+    }
+
+    // Advances over each position (character):
+    while (true) {
+
+      if (buffer.get(pos) == -1) {
+        // End
+        break;
+      }
+
+      final Position posData = positions.get(pos);
+      final boolean isFrontier = positions.getNextPos() == pos+1;
+
+      if (posData.count == 0) {
+        // No arcs arrive here; move to next position:
+        pos++;
+        if (VERBOSE) {
+          System.out.println("    no arcs in; skip");
+        }
+        continue;
+      }
+
+      if (pos > lastBackTracePos && posData.count == 1 && isFrontier) {
+        //  if (pos > lastBackTracePos && posData.count == 1 && isFrontier) {
+        // We are at a "frontier", and only one node is
+        // alive, so whatever the eventual best path is must
+        // come through this node.  So we can safely commit
+        // to the prefix of the best path at this point:
+        backtrace(posData, 0);
+
+        // Re-base cost so we don't risk int overflow:
+        posData.costs[0] = 0;
+
+        if (pending.size() != 0) {
+          return;
+        } else {
+          // This means the backtrace only produced
+          // punctuation tokens, so we must keep parsing.
+        }
+      }
+
+      if (pos - lastBackTracePos >= 2048) {
+        // Safety: if we've buffered too much, force a
+        // backtrace now:
+        int leastIDX = -1;
+        int leastCost = Integer.MAX_VALUE;
+        for(int idx=0;idx<posData.count;idx++) {
+          //System.out.println("    idx=" + idx + " cost=" + cost);
+          final int cost = posData.costs[idx];
+          if (cost < leastCost) {
+            leastCost = cost;
+            leastIDX = idx;
+          }
+        }
+        backtrace(posData, leastIDX);
+
+        // Re-base cost so we don't risk int overflow:
+        Arrays.fill(posData.costs, 0, posData.count, 0);
+
+        if (pending.size() != 0) {
+          return;
+        } else {
+          // This means the backtrace only produced
+          // punctuation tokens, so we must keep parsing.
+        }
+      }
+
+      if (VERBOSE) {
+        System.out.println("\n  extend @ pos=" + pos + " char=" + (char) buffer.get(pos));
+      }
+
+      if (VERBOSE) {
+        System.out.println("    " + posData.count + " arcs in");
+      }
+
+      boolean anyMatches = false;
+
+      // First try user dict:
+      if (userFST != null) {
+        userFST.getFirstArc(arc);
+        int output = 0;
+        for(int posAhead=posData.pos;;posAhead++) {
+          final int ch = buffer.get(posAhead);
+          if (ch == -1) {
+            break;
+          }
+          if (userFST.findTargetArc(ch, arc, arc, posAhead == posData.pos, userFSTReader) == null) {
+            break;
+          }
+          output += arc.output.intValue();
+          if (arc.isFinal()) {
+            if (VERBOSE) {
+              System.out.println("    USER word " + new String(buffer.get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1));
+            }
+            add(userDictionary, posData, posAhead+1, output + arc.nextFinalOutput.intValue(), Type.USER, false);
+            anyMatches = true;
+          }
+        }
+      }
+
+      // TODO: we can be more aggressive about user
+      // matches?  if we are "under" a user match then don't
+      // extend KNOWN/UNKNOWN paths?
+
+      if (!anyMatches) {
+        // Next, try known dictionary matches
+        fst.getFirstArc(arc);
+        int output = 0;
+
+        for(int posAhead=posData.pos;;posAhead++) {
+          final int ch = buffer.get(posAhead);
+          if (ch == -1) {
+            break;
+          }
+          //System.out.println("    match " + (char) ch + " posAhead=" + posAhead);
+          
+          if (fst.findTargetArc(ch, arc, arc, posAhead == posData.pos, fstReader) == null) {
+            break;
+          }
+
+          output += arc.output.intValue();
+
+          // Optimization: for known words that are too-long
+          // (compound), we should pre-compute the 2nd
+          // best segmentation and store it in the
+          // dictionary instead of recomputing it each time a
+          // match is found.
+
+          if (arc.isFinal()) {
+            dictionary.lookupWordIds(output + arc.nextFinalOutput.intValue(), wordIdRef);
+            if (VERBOSE) {
+              System.out.println("    KNOWN word " + new String(buffer.get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1) + " " + wordIdRef.length + " wordIDs");
+            }
+            for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
+              add(dictionary, posData, posAhead+1, wordIdRef.ints[wordIdRef.offset + ofs], Type.KNOWN, false);
+              anyMatches = true;
+            }
+          }
+        }
+      }
+
+      // In normal mode, don't look up unknown words at positions still inside the previous unknown word:
+
+      if (!searchMode && unknownWordEndIndex > posData.pos) {
+        pos++;
+        continue;
+      }
+
+      final char firstCharacter = (char) buffer.get(pos);
+      if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) {
+
+        // Find unknown match:
+        final int characterId = characterDefinition.getCharacterClass(firstCharacter);
+
+        // NOTE: copied from UnknownDictionary.lookup:
+        int unknownWordLength;
+        if (!characterDefinition.isGroup(firstCharacter)) {
+          unknownWordLength = 1;
+        } else {
+          // Extract unknown word. Characters with the same character class are considered to be part of unknown word
+          unknownWordLength = 1;
+          for (int posAhead=pos+1;;posAhead++) {
+            final int ch = buffer.get(posAhead);
+            if (ch == -1) {
+              break;
+            }
+            if (characterId == characterDefinition.getCharacterClass((char) ch)) {
+              unknownWordLength++;    			
+            } else {
+              break;
+            }
+          }
+        }
+
+        unkDictionary.lookupWordIds(characterId, wordIdRef); // characters in input text are supposed to be the same
+        if (VERBOSE) {
+          System.out.println("    UNKNOWN word len=" + unknownWordLength + " " + wordIdRef.length + " wordIDs");
+        }
+        for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
+          add(unkDictionary, posData, posData.pos + unknownWordLength, wordIdRef.ints[wordIdRef.offset + ofs], Type.UNKNOWN, false);
+        }
+
+        unknownWordEndIndex = posData.pos + unknownWordLength;
+      }
+
+      pos++;
+    }
+
+    end = true;
+
+    if (pos > 0) {
+
+      final Position endPosData = positions.get(pos);
+      int leastCost = Integer.MAX_VALUE;
+      int leastIDX = -1;
+      if (VERBOSE) {
+        System.out.println("  end: " + endPosData.count + " nodes");
+      }
+      for(int idx=0;idx<endPosData.count;idx++) {
+        // Add EOS cost:
+        final int cost = endPosData.costs[idx] + costs.get(endPosData.lastRightID[idx], 0);
+        //System.out.println("    idx=" + idx + " cost=" + cost + " (pathCost=" + endPosData.costs[idx] + " bgCost=" + costs.get(endPosData.lastRightID[idx], 0) + ") backPos=" + endPosData.backPos[idx]);
+        if (cost < leastCost) {
+          leastCost = cost;
+          leastIDX = idx;
+        }
+      }
+
+      backtrace(endPosData, leastIDX);
+    } else {
+      // No characters in the input string; return no tokens!
+    }
+  }
+
+  // Eliminates arcs from the lattice that are compound
+  // tokens (have a penalty) or are not congruent with the
+  // compound token we've matched (ie, span across the
+  // startPos).  This should be fairly efficient, because we
+  // just keep the already intersected structure of the
+  // graph, eg we don't have to consult the FSTs again:
+
+  private void pruneAndRescore(int startPos, int endPos, int bestStartIDX) throws IOException {
+    if (VERBOSE) {
+      System.out.println("  pruneAndRescore startPos=" + startPos + " endPos=" + endPos + " bestStartIDX=" + bestStartIDX);
+    }
+
+    // First pass: walk backwards, building up the forward
+    // arcs and pruning inadmissible arcs:
+    for(int pos=endPos; pos >= startPos; pos--) {
+      final Position posData = positions.get(pos);
+      if (VERBOSE) {
+        System.out.println("    back pos=" + pos);
+      }
+      for(int arcIDX=0;arcIDX<posData.count;arcIDX++) {
+        final int backPos = posData.backPos[arcIDX];
+        if (backPos >= startPos) {
+          // Keep this arc:
+          //System.out.println("      keep backPos=" + backPos);
+          positions.get(backPos).addForward(pos,
+                                            arcIDX,
+                                            posData.backID[arcIDX],
+                                            posData.backType[arcIDX]);
+        } else {
+          if (VERBOSE) {
+            System.out.println("      prune");
+          }
+        }
+      }
+      if (pos != startPos) {
+        posData.count = 0;
+      }
+    }
+
+    // Second pass: walk forward, re-scoring:
+    for(int pos=startPos; pos < endPos; pos++) {
+      final Position posData = positions.get(pos);
+      if (VERBOSE) {
+        System.out.println("    forward pos=" + pos + " count=" + posData.forwardCount);
+      }
+      if (posData.count == 0) {
+        // No arcs arrive here...
+        if (VERBOSE) {
+          System.out.println("      skip");
+        }
+        posData.forwardCount = 0;
+        continue;
+      }
+
+      if (pos == startPos) {
+        // On the initial position, only consider the best
+        // path so we "force congruence":  the
+        // sub-segmentation is "in context" of what the best
+        // path (compound token) had matched:
+        final int rightID;
+        if (startPos == 0) {
+          rightID = 0;
+        } else {
+          rightID = getDict(posData.backType[bestStartIDX]).getRightId(posData.backID[bestStartIDX]);
+        }
+        final int pathCost = posData.costs[bestStartIDX];
+        for(int forwardArcIDX=0;forwardArcIDX<posData.forwardCount;forwardArcIDX++) {
+          final Type forwardType = posData.forwardType[forwardArcIDX];
+          final Dictionary dict2 = getDict(forwardType);
+          final int wordID = posData.forwardID[forwardArcIDX];
+          final int toPos = posData.forwardPos[forwardArcIDX];
+          final int newCost = pathCost + dict2.getWordCost(wordID) + 
+            costs.get(rightID, dict2.getLeftId(wordID)) +
+            computePenalty(pos, toPos-pos);
+          if (VERBOSE) {
+            System.out.println("      + " + forwardType + " word " + new String(buffer.get(pos, toPos-pos)) + " toPos=" + toPos + " cost=" + newCost + " penalty=" + computePenalty(pos, toPos-pos) + " toPos.idx=" + positions.get(toPos).count);
+          }
+          positions.get(toPos).add(newCost,
+                                   dict2.getRightId(wordID),
+                                   pos,
+                                   bestStartIDX,
+                                   wordID,
+                                   forwardType);
+        }
+      } else {
+        // On non-initial positions, we minimize cost
+        // across all arriving lastRightIDs:
+        for(int forwardArcIDX=0;forwardArcIDX<posData.forwardCount;forwardArcIDX++) {
+          final Type forwardType = posData.forwardType[forwardArcIDX];
+          final int toPos = posData.forwardPos[forwardArcIDX];
+          if (VERBOSE) {
+            System.out.println("      + " + forwardType + " word " + new String(buffer.get(pos, toPos-pos)) + " toPos=" + toPos);
+          }
+          add(getDict(forwardType),
+              posData,
+              toPos,
+              posData.forwardID[forwardArcIDX],
+              forwardType,
+              true);
+        }
+      }
+      posData.forwardCount = 0;
+    }
+  }
+
+  // Backtrace from endPosData (starting with its arriving path fromIDX) back
+  // to lastBackTracePos, accumulating the resulting tokens in the pending
+  // list.  The pending list is then in-reverse (last token should be returned
+  // first).  Afterwards lastBackTracePos advances and older state is freed.
+  private void backtrace(final Position endPosData, final int fromIDX) throws IOException {
+    if (VERBOSE) {
+      System.out.println("\n  backtrace: pos=" + pos + "; " + (pos - lastBackTracePos) + " characters; last=" + lastBackTracePos + " cost=" + endPosData.costs[fromIDX]);
+    }
+    final int endPos = endPosData.pos;
+
+    final char[] fragment = buffer.get(lastBackTracePos, endPos-lastBackTracePos);  // chars since the last backtrace; token offsets below are relative to it
+
+    if (dotOut != null) {
+      dotOut.onBacktrace(this, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
+    }
+
+    int pos = endPos;  // local cursor walking backwards; NOTE: shadows the outer pos used in the VERBOSE line above
+    int bestIDX = fromIDX;
+    Token altToken = null;  // compound token held back until the alternate path reaches its start
+
+    // We trace backwards, so this will be the leftWordID of
+    // the token after the one we are now on:
+    int lastLeftWordID = -1;
+
+    int backCount = 0;  // tokens added to pending so far (reset to 0 when an altToken is saved)
+
+    // TODO: sort of silly to make Token instances here; the
+    // back trace has all info needed to generate the
+    // token.  So, we could just directly set the attrs,
+    // from the backtrace, in incrementToken w/o ever
+    // creating Token; we'd have to defer calling freeBefore
+    // until after the backtrace was fully "consumed" by
+    // incrementToken.
+
+    while (pos > lastBackTracePos) {
+      //System.out.println("back pos=" + pos);
+      final Position posData = positions.get(pos);
+
+      int backPos = posData.backPos[bestIDX];
+      int length = pos - backPos;
+      Type backType = posData.backType[bestIDX];
+      int backID = posData.backID[bestIDX];
+
+      if (outputCompounds && searchMode && altToken == null && backType != Type.USER) {
+        
+        // In searchMode, if best path had picked a too-long
+        // token, we use the "penalty" to compute the allowed
+        // max cost of an alternate back-trace.  If we find an
+        // alternate back trace with cost below that
+        // threshold, we pursue it instead (but also output
+        // the long token).
+
+        final int penalty = computeSecondBestThreshold(backPos, pos-backPos);
+        
+        if (penalty > 0) {
+          if (VERBOSE) {
+            System.out.println("  compound=" + new String(buffer.get(backPos, pos-backPos)) + " backPos=" + backPos + " pos=" + pos + " penalty=" + penalty + " cost=" + posData.costs[bestIDX] + " bestIDX=" + bestIDX + " lastLeftID=" + lastLeftWordID);
+          }
+
+          // Use the penalty to set maxCost on the 2nd best
+          // segmentation:
+          int maxCost = posData.costs[bestIDX] + penalty;
+          if (lastLeftWordID != -1) {
+            maxCost += costs.get(getDict(backType).getRightId(backID), lastLeftWordID);
+          }
+
+          // Now, prune all too-long tokens from the graph:
+          pruneAndRescore(backPos, pos,
+                          posData.backIndex[bestIDX]);
+
+          // Finally, find 2nd best back-trace and resume
+          // backtrace there:
+          int leastCost = Integer.MAX_VALUE;
+          int leastIDX = -1;
+          for(int idx=0;idx<posData.count;idx++) {
+            int cost = posData.costs[idx];
+            //System.out.println("    idx=" + idx + " prevCost=" + cost);
+            
+            if (lastLeftWordID != -1) {
+              cost += costs.get(getDict(posData.backType[idx]).getRightId(posData.backID[idx]),
+                                lastLeftWordID);
+              //System.out.println("      += bgCost=" + costs.get(getDict(posData.backType[idx]).getRightId(posData.backID[idx]),
+              //lastLeftWordID) + " -> " + cost);
+            }
+            //System.out.println("penalty " + posData.backPos[idx] + " to " + pos);
+            //cost += computePenalty(posData.backPos[idx], pos - posData.backPos[idx]);
+            if (cost < leastCost) {
+              //System.out.println("      ** ");
+              leastCost = cost;
+              leastIDX = idx;
+            }
+          }
+          //System.out.println("  leastIDX=" + leastIDX);
+
+          if (VERBOSE) {
+            System.out.println("  afterPrune: " + posData.count + " arcs arriving; leastCost=" + leastCost + " vs threshold=" + maxCost + " lastLeftWordID=" + lastLeftWordID);
+          }
+
+          if (leastIDX != -1 && leastCost <= maxCost && posData.backPos[leastIDX] != backPos) {
+            // We should have pruned the altToken from the graph:
+            assert posData.backPos[leastIDX] != backPos;
+
+            // Save the current compound token, to output when
+            // this alternate path joins back:
+            altToken = new Token(backID,
+                                 fragment,
+                                 backPos - lastBackTracePos,
+                                 length,
+                                 backType,
+                                 backPos,
+                                 getDict(backType));
+
+            // Redirect our backtrace to 2nd best:
+            bestIDX = leastIDX;
+
+            backPos = posData.backPos[bestIDX];
+            length = pos - backPos;
+            backType = posData.backType[bestIDX];
+            backID = posData.backID[bestIDX];
+            backCount = 0;
+            
+          } else {
+            // I think in theory it's possible there is no
+            // 2nd best path, which is fine; in this case we
+            // only output the compound token:
+          }
+        }
+      }
+
+      final int offset = backPos - lastBackTracePos;
+      assert offset >= 0;
+
+      if (altToken != null && altToken.getPosition() >= backPos) {
+
+        // We've backtraced to the position where the
+        // compound token starts; add it now:
+
+        // The pruning we did when we created the altToken
+        // ensures that the back trace will align back with
+        // the start of the altToken:
+        // cannot assert...
+        //assert altToken.getPosition() == backPos: altToken.getPosition() + " vs " + backPos;
+
+        if (VERBOSE) {
+          System.out.println("    add altToken=" + altToken);
+        }
+        if (backCount > 0) {
+          backCount++;  // NOTE(review): presumably counts the current token, which is only added to pending further below
+          altToken.setPositionLength(backCount);
+          pending.add(altToken);
+        } else {
+          // This means alt token was all punct tokens:
+          assert discardPunctuation;
+        }
+        altToken = null;
+      }
+
+      final Dictionary dict = getDict(backType);
+
+      if (backType == Type.USER) {
+
+        // Expand the phraseID we recorded into the actual
+        // segmentation:
+        final int[] wordIDAndLength = userDictionary.lookupSegmentation(backID);
+        int wordID = wordIDAndLength[0];
+        int current = 0;
+        for(int j=1; j < wordIDAndLength.length; j++) {
+          final int len = wordIDAndLength[j];
+          //System.out.println("    add user: len=" + len);
+          pending.add(new Token(wordID+j-1,
+                                fragment,
+                                current + offset,
+                                len,
+                                Type.USER,
+                                current + backPos,
+                                dict));
+          if (VERBOSE) {
+            System.out.println("    add USER token=" + pending.get(pending.size()-1));
+          }
+          current += len;
+        }
+
+        // Reverse the tokens we just added, because when we
+        // serve them up from incrementToken we serve in
+        // reverse:
+        Collections.reverse(pending.subList(pending.size() - (wordIDAndLength.length - 1),
+                                            pending.size()));
+
+        backCount += wordIDAndLength.length-1;
+      } else {
+
+        if (extendedMode && backType == Type.UNKNOWN) {
+          // In EXTENDED mode we convert unknown word into
+          // unigrams:
+          int unigramTokenCount = 0;
+          for(int i=length-1;i>=0;i--) {
+            int charLen = 1;
+            if (i > 0 && Character.isLowSurrogate(fragment[offset+i])) {
+              i--;
+              charLen = 2;
+            }
+            //System.out.println("    extended tok offset="
+            //+ (offset + i));
+            if (!discardPunctuation || !isPunctuation(fragment[offset+i])) {
+              pending.add(new Token(CharacterDefinition.NGRAM,
+                                    fragment,
+                                    offset + i,
+                                    charLen,
+                                    Type.UNKNOWN,
+                                    backPos + i,
+                                    unkDictionary));
+              unigramTokenCount++;
+            }
+          }
+          backCount += unigramTokenCount;
+          
+        } else if (!discardPunctuation || length == 0 || !isPunctuation(fragment[offset])) {
+          pending.add(new Token(backID,
+                                fragment,
+                                offset,
+                                length,
+                                backType,
+                                backPos,
+                                dict));
+          if (VERBOSE) {
+            System.out.println("    add token=" + pending.get(pending.size()-1));
+          }
+          backCount++;
+        } else {
+          if (VERBOSE) {
+            System.out.println("    skip punctuation token=" + new String(fragment, offset, length));
+          }
+        }
+      }
+
+      lastLeftWordID = dict.getLeftId(backID);
+      pos = backPos;
+      bestIDX = posData.backIndex[bestIDX];
+    }
+
+    lastBackTracePos = endPos;
+
+    if (VERBOSE) {
+      System.out.println("  freeBefore pos=" + endPos);
+    }
+    // Notify the circular buffers that we are done with
+    // these positions:
+    buffer.freeBefore(endPos);
+    positions.freeBefore(endPos);
+  }
+
+  Dictionary getDict(Type type) {  // Looks up the Dictionary stored in dictionaryMap for the given token Type.
+    return dictionaryMap.get(type);
+  }
+
+  private static boolean isPunctuation(char ch) {  // true for separator, control/format, punctuation and symbol categories
+    switch(Character.getType(ch)) {  // Unicode general category of ch
+      case Character.SPACE_SEPARATOR:
+      case Character.LINE_SEPARATOR:
+      case Character.PARAGRAPH_SEPARATOR:
+      case Character.CONTROL:
+      case Character.FORMAT:
+      case Character.DASH_PUNCTUATION:
+      case Character.START_PUNCTUATION:
+      case Character.END_PUNCTUATION:
+      case Character.CONNECTOR_PUNCTUATION:
+      case Character.OTHER_PUNCTUATION:
+      case Character.MATH_SYMBOL:
+      case Character.CURRENCY_SYMBOL:
+      case Character.MODIFIER_SYMBOL:
+      case Character.OTHER_SYMBOL:
+      case Character.INITIAL_QUOTE_PUNCTUATION:
+      case Character.FINAL_QUOTE_PUNCTUATION:
+        return true;
+      default:  // every other category (letters, digits, marks, ...) is not punctuation
+        return false;
+    }
+  }
 }
Index: lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestSimpleAttributeImpl.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestSimpleAttributeImpl.java	(revision 1294102)
+++ lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestSimpleAttributeImpl.java	(working copy)
@@ -29,6 +29,8 @@
   public void testAttributes() {
     _TestUtil.assertAttributeReflection(new PositionIncrementAttributeImpl(),
       Collections.singletonMap(PositionIncrementAttribute.class.getName()+"#positionIncrement", 1));
+    _TestUtil.assertAttributeReflection(new PositionLengthAttributeImpl(),
+      Collections.singletonMap(PositionLengthAttribute.class.getName()+"#positionLength", 1));
     _TestUtil.assertAttributeReflection(new FlagsAttributeImpl(),
       Collections.singletonMap(FlagsAttribute.class.getName()+"#flags", 0));
     _TestUtil.assertAttributeReflection(new TypeAttributeImpl(),
Index: lucene/core/src/test/org/apache/lucene/util/TestRollingCharBuffer.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/util/TestRollingCharBuffer.java	(revision 0)
+++ lucene/core/src/test/org/apache/lucene/util/TestRollingCharBuffer.java	(working copy)
@@ -0,0 +1,94 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.StringReader;
+
+public class TestRollingCharBuffer extends LuceneTestCase {
+
+  public void test() throws Exception {  // Randomized check that RollingCharBuffer reads match the source string.
+    final int ITERS = atLeast(1000);
+    
+    RollingCharBuffer buffer = new RollingCharBuffer();
+
+    for(int iter=0;iter<ITERS;iter++) {
+      final int stringLen = random.nextBoolean() ? random.nextInt(50) : random.nextInt(20000);
+      final String s;
+      if (stringLen == 0) {
+        s = "";
+      } else {
+        s = _TestUtil.randomUnicodeString(random, stringLen);
+      }
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter=" + iter + " s.length()=" + s.length());
+      }
+      buffer.reset(new StringReader(s));
+      int nextRead = 0;  // next absolute position not yet read through the buffer
+      int availCount = 0;  // number of chars before nextRead that have not been freed
+      while(nextRead < s.length()) {
+        if (VERBOSE) {
+          System.out.println("  cycle nextRead=" + nextRead + " avail=" + availCount);
+        }
+        if (availCount == 0 || random.nextBoolean()) {
+          // Read next char (forced when nothing is available); grows the window by one
+          if (VERBOSE) {
+            System.out.println("    new char");
+          }
+          assertEquals(s.charAt(nextRead), buffer.get(nextRead));
+          nextRead++;
+          availCount++;
+        } else if (random.nextBoolean()) {
+          // Read previous char at a random position inside the available window
+          int pos = _TestUtil.nextInt(random, nextRead-availCount, nextRead-1);
+          if (VERBOSE) {
+            System.out.println("    old char pos=" + pos);
+          }
+          assertEquals(s.charAt(pos), buffer.get(pos));
+        } else {
+          // Read slice of random length/start that lies inside the available window
+          int length;
+          if (availCount == 1) {
+            length = 1;
+          } else {
+            length = _TestUtil.nextInt(random, 1, availCount);
+          }
+          int start;
+          if (length == availCount) {
+            start = nextRead - availCount;
+          } else {
+            start = nextRead - availCount + random.nextInt(availCount-length);
+          }
+          if (VERBOSE) {
+            System.out.println("    slice start=" + start + " length=" + length);
+          }
+          assertEquals(s.substring(start, start+length),
+                       new String(buffer.get(start, length)));
+        }
+
+        if (availCount > 0 && random.nextInt(20) == 17) {  // occasionally free a random-sized prefix of the window
+          final int toFree = random.nextInt(availCount);
+          if (VERBOSE) {
+            System.out.println("    free " + toFree + " (avail=" + (availCount-toFree) + ")");
+          }
+          buffer.freeBefore(nextRead-(availCount-toFree));
+          availCount -= toFree;
+        }
+      }
+    }
+  }
+}

Property changes on: lucene/core/src/test/org/apache/lucene/util/TestRollingCharBuffer.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
Index: lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java	(revision 0)
+++ lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java	(working copy)
@@ -0,0 +1,74 @@
+package org.apache.lucene.analysis.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.AttributeImpl;
+
+/** Default implementation of {@link PositionLengthAttribute}; the position length starts at 1. */
+public class PositionLengthAttributeImpl extends AttributeImpl implements PositionLengthAttribute, Cloneable {
+  private int positionLength = 1;
+  
+  /** @param positionLength how many positions this token spans; this is
+   *  optional, and most analyzers don't change the default value (1).
+   *  @throws IllegalArgumentException if positionLength is less than 1 */
+  public void setPositionLength(int positionLength) {
+    if (positionLength < 1) {
+      throw new IllegalArgumentException
+        ("Position length must be 1 or greater: got " + positionLength);
+    }
+    this.positionLength = positionLength;
+  }
+
+  /** Returns the position length of this Token.
+   * @see #setPositionLength    
+   */
+  public int getPositionLength() {
+    return positionLength;
+  }
+
+  @Override
+  public void clear() {
+    this.positionLength = 1;  // restore the default
+  }
+  
+  @Override
+  public boolean equals(Object other) {
+    if (other == this) {
+      return true;
+    }
+    
+    if (other instanceof PositionLengthAttributeImpl) {
+      PositionLengthAttributeImpl _other = (PositionLengthAttributeImpl) other;
+      return positionLength ==  _other.positionLength;
+    }
+ 
+    return false;
+  }
+
+  @Override
+  public int hashCode() {
+    return positionLength;  // consistent with equals, which compares only positionLength
+  }
+  
+  @Override
+  public void copyTo(AttributeImpl target) {
+    PositionLengthAttribute t = (PositionLengthAttribute) target;
+    t.setPositionLength(positionLength);
+  }  
+}

Property changes on: lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
Index: lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java	(revision 1294102)
+++ lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java	(working copy)
@@ -52,9 +52,10 @@
    * @param positionIncrement the distance from the prior term
    */
   public void setPositionIncrement(int positionIncrement) {
-    if (positionIncrement < 0)
+    if (positionIncrement < 0) {
       throw new IllegalArgumentException
-        ("Increment must be zero or greater: " + positionIncrement);
+        ("Increment must be zero or greater: got " + positionIncrement);
+    }
     this.positionIncrement = positionIncrement;
   }
 
@@ -77,7 +78,8 @@
     }
     
     if (other instanceof PositionIncrementAttributeImpl) {
-      return positionIncrement == ((PositionIncrementAttributeImpl) other).positionIncrement;
+      PositionIncrementAttributeImpl _other = (PositionIncrementAttributeImpl) other;
+      return positionIncrement ==  _other.positionIncrement;
     }
  
     return false;
@@ -93,5 +95,4 @@
     PositionIncrementAttribute t = (PositionIncrementAttribute) target;
     t.setPositionIncrement(positionIncrement);
   }  
-
 }
Index: lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttribute.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttribute.java	(revision 0)
+++ lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttribute.java	(working copy)
@@ -0,0 +1,41 @@
+package org.apache.lucene.analysis.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.Attribute;
+
+/** The positionLength determines how many positions this
+ *  token spans.  Very few analyzer components actually
+ *  produce this attribute, and indexing ignores it, but
+ *  it's useful to express the graph structure naturally
+ *  produced by decompounding, word splitting/joining,
+ *  synonym filtering, etc.
+ *
+ * <p>The default value is one. */
+
+public interface PositionLengthAttribute extends Attribute {
+  /** Sets the position length of this Token.
+   *  @param positionLength how many positions this token spans. */
+  public void setPositionLength(int positionLength);
+
+  /** Returns the position length of this Token (how many positions it spans).
+   * @see #setPositionLength
+   */
+  public int getPositionLength();
+}
+

Property changes on: lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttribute.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
Index: lucene/core/src/java/org/apache/lucene/util/fst/FST.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/fst/FST.java	(revision 1294102)
+++ lucene/core/src/java/org/apache/lucene/util/fst/FST.java	(working copy)
@@ -840,6 +840,7 @@
   }
 
   public Arc<T> readFirstRealTargetArc(int node, Arc<T> arc, final BytesReader in) throws IOException {
+    assert in.bytes == bytes;
     final int address = getNodeAddress(node);
     in.pos = address;
     //System.out.println("  readFirstRealTargtArc address="
@@ -936,6 +937,7 @@
   /** Never returns null, but you should never call this if
    *  arc.isLast() is true. */
   public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException {
+    assert in.bytes == bytes;
 
     // TODO: can't assert this because we call from readFirstArc
     // assert !flag(arc.flags, BIT_LAST_ARC);
@@ -1019,6 +1021,7 @@
    *  This returns null if the arc was not found, else the incoming arc. */
   public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesReader in) throws IOException {
     assert cachedRootArcs != null;
+    assert in.bytes == bytes;
 
     if (labelToMatch == END_LABEL) {
       if (follow.isFinal()) {
@@ -1225,17 +1228,20 @@
 
   /** Expert */
   public static abstract class BytesReader extends DataInput {
-    int pos;
+    protected int pos;
+    protected final byte[] bytes;
+    protected BytesReader(byte[] bytes, int pos) {
+      this.bytes = bytes;
+      this.pos = pos;
+    }
     abstract void skip(int byteCount);
     abstract void skip(int base, int byteCount);
   }
 
   final static class ReverseBytesReader extends BytesReader {
-    final byte[] bytes;
 
     public ReverseBytesReader(byte[] bytes, int pos) {
-      this.bytes = bytes;
-      this.pos = pos;
+      super(bytes, pos);
     }
 
     @Override
@@ -1262,11 +1268,9 @@
   // TODO: can we use just ByteArrayDataInput...?  need to
   // add a .skipBytes to DataInput.. hmm and .setPosition
   final static class ForwardBytesReader extends BytesReader {
-    final byte[] bytes;
 
     public ForwardBytesReader(byte[] bytes, int pos) {
-      this.bytes = bytes;
-      this.pos = pos;
+      super(bytes, pos);
     }
 
     @Override
Index: lucene/core/src/java/org/apache/lucene/util/RollingCharBuffer.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/RollingCharBuffer.java	(revision 0)
+++ lucene/core/src/java/org/apache/lucene/util/RollingCharBuffer.java	(working copy)
@@ -0,0 +1,148 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+/** Acts like a forever growing char[] as you read
+ *  characters into it from the provided reader, but
+ *  internally it uses a circular buffer to only hold the
+ *  characters that haven't been freed yet.  This is like a
+ *  PushbackReader, except you don't have to specify
+ *  up-front the max size of the buffer, but you do have to
+ *  periodically call {@link #freeBefore}. */
+
+public final class RollingCharBuffer {
+
+  private Reader reader;
+
+  private char[] buffer = new char[32];
+
+  // Next array index to write to in buffer:
+  private int nextWrite;
+
+  // Next absolute position to read from reader:
+  private int nextPos;
+
+  // How many valid chars (wrapped) are in the buffer:
+  private int count;
+
+  // True if we hit EOF
+  private boolean end;
+    
+  /** Reset all positions/counters and switch to new reader (the backing array is kept). */
+  public void reset(Reader reader) {
+    this.reader = reader;
+    nextPos = 0;
+    nextWrite = 0;
+    count = 0;
+    end = false;
+  }
+
+  /** Absolute position read.  NOTE: pos must not jump
+   * ahead by more than 1!  Ie, it's OK to read arbitrarily
+   * far back (just not prior to the last {@link
+   * #freeBefore}), but NOT ok to read arbitrarily far
+   * ahead.  Returns -1 if you hit EOF. */
+  public int get(int pos) throws IOException {
+    //System.out.println("    get pos=" + pos + " nextPos=" + nextPos + " count=" + count);
+    if (pos == nextPos) {
+      if (end) {
+        return -1;
+      }
+      final int ch = reader.read();
+      if (ch == -1) {
+        end = true;
+        return -1;
+      }
+      if (count == buffer.length) {
+        // Grow: copy into a larger array, oldest char (at nextWrite) first, so the data is unwrapped
+        final char[] newBuffer = new char[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_CHAR)];
+        //System.out.println(Thread.currentThread().getName() + ": cb grow " + newBuffer.length);
+        System.arraycopy(buffer, nextWrite, newBuffer, 0, buffer.length - nextWrite);
+        System.arraycopy(buffer, 0, newBuffer, buffer.length - nextWrite, nextWrite);
+        nextWrite = buffer.length;
+        buffer = newBuffer;
+      }
+      if (nextWrite == buffer.length) {  // wrap the write index around to the start of the array
+        nextWrite = 0;
+      }
+      buffer[nextWrite++] = (char) ch;
+      count++;
+      nextPos++;
+      return ch;
+    } else {
+      // Cannot read from future (except by 1):
+      assert pos < nextPos;
+
+      // Cannot read from already freed past:
+      assert nextPos - pos <= count;
+
+      final int index = getIndex(pos);
+      return buffer[index];
+    }
+  }
+
+  // For assert: true if pos has been read already and has not been freed
+  private boolean inBounds(int pos) {
+    return pos >= 0 && pos < nextPos && pos >= nextPos - count;
+  }
+
+  private int getIndex(int pos) {  // maps an absolute position to its index in the circular array
+    int index = nextWrite - (nextPos - pos);
+    if (index < 0) {
+      // Wrap:
+      index += buffer.length;
+      assert index >= 0;
+    }
+    return index;
+  }
+
+  public char[] get(int posStart, int length) {  // returns a fresh copy of the chars at [posStart, posStart+length)
+    assert length > 0;
+    assert inBounds(posStart): "posStart=" + posStart + " length=" + length;
+    //System.out.println("    buffer.get posStart=" + posStart + " len=" + length);
+      
+    final int startIndex = getIndex(posStart);
+    final int endIndex = getIndex(posStart + length);
+    //System.out.println("      startIndex=" + startIndex + " endIndex=" + endIndex);
+
+    final char[] result = new char[length];
+    if (endIndex >= startIndex && length < buffer.length) {
+      System.arraycopy(buffer, startIndex, result, 0, endIndex-startIndex);
+    } else {
+      // Wrapped: copy the tail of the array first, then the remainder from its start
+      final int part1 = buffer.length-startIndex;
+      System.arraycopy(buffer, startIndex, result, 0, part1);
+      System.arraycopy(buffer, 0, result, buffer.length-startIndex, length-part1);
+    }
+    return result;
+  }
+
+  /** Call this to notify us that no chars before this
+   *  absolute position are needed anymore. */
+  public void freeBefore(int pos) {
+    assert pos >= 0;
+    assert pos <= nextPos;
+    final int newCount = nextPos - pos;
+    assert newCount <= count: "newCount=" + newCount + " count=" + count;
+    assert newCount <= buffer.length: "newCount=" + newCount + " buf.length=" + buffer.length;
+    count = newCount;  // only the valid-char count shrinks; no data is moved or cleared
+  }
+}

Property changes on: lucene/core/src/java/org/apache/lucene/util/RollingCharBuffer.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
Index: lucene/contrib/CHANGES.txt
===================================================================
--- lucene/contrib/CHANGES.txt	(revision 1294102)
+++ lucene/contrib/CHANGES.txt	(working copy)
@@ -143,6 +143,9 @@
  * LUCENE-3730: Refine Kuromoji search mode (Mode.SEARCH) decompounding
    heuristics.  (Christian Moen via Robert Muir)
 
+ * LUCENE-3767: Kuromoji tokenizer/analyzer produces both compound words 
+   and the segmentation of that compound in Mode.SEARCH. (Robert Muir, Mike McCandless)
+
  * LUCENE-3685: Add ToChildBlockJoinQuery and renamed previous
    BlockJoinQuery to ToParentBlockJoinQuery, so that you can now do
    joins in both parent to child and child to parent directions.
Index: lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToDot.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToDot.java	(revision 0)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToDot.java	(working copy)
@@ -0,0 +1,181 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.PrintWriter;
+import java.io.IOException;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+
+/** Consumes a TokenStream and outputs the dot (graphviz) string (graph).
+ *  Every new token position becomes a node, and every token becomes an
+ *  arc from its position to its position plus its position length. */
+public class TokenStreamToDot {
+
+  private final TokenStream in;
+  private final CharTermAttribute termAtt;
+  private final PositionIncrementAttribute posIncAtt;
+  private final PositionLengthAttribute posLengthAtt;
+  private final OffsetAttribute offsetAtt;
+  private final String inputText;
+  protected final PrintWriter out;
+
+  /** If inputText is non-null, and the TokenStream has
+   *  offsets, we include the surface form in each arc's
+   *  label. */
+  public TokenStreamToDot(String inputText, TokenStream in, PrintWriter out) {
+    this.in = in;
+    this.out = out;
+    this.inputText = inputText;
+    termAtt = in.addAttribute(CharTermAttribute.class);
+    posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
+    posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
+    // Offsets are optional: only use them if the stream already has them.
+    if (in.hasAttribute(OffsetAttribute.class)) {
+      offsetAtt = in.addAttribute(OffsetAttribute.class);
+    } else {
+      offsetAtt = null;
+    }
+  }
+
+  /** Resets and consumes the whole TokenStream, writing the graph to
+   *  the PrintWriter.  The stream is ended but not closed; the writer
+   *  is flushed but not closed. */
+  public void toDot() throws IOException {
+    in.reset();
+    writeHeader();
+
+    // TODO: is there some way to tell dot that it should
+    // make the "main path" a straight line and have the
+    // non-sausage arcs not affect node placement...
+
+    int pos = -1;
+    int lastEndPos = -1;
+    while (in.incrementToken()) {
+      final boolean isFirst = pos == -1;
+      int posInc = posIncAtt.getPositionIncrement();
+      if (isFirst && posInc == 0) {
+        // TODO: hmm are TS's still allowed to do this...?
+        System.err.println("WARNING: first posInc was 0; correcting to 1");
+        posInc = 1;
+      }
+
+      if (posInc > 0) {
+        // New node:
+        pos += posInc;
+        writeNode(pos, Integer.toString(pos));
+      }
+
+      if (posInc > 1) {
+        // Gap!
+        writeArc(lastEndPos, pos, null, "dotted");
+      }
+
+      if (isFirst) {
+        // Invisible start node (-1) leading into the first position:
+        writeNode(-1, null);
+        writeArc(-1, pos, null, null);
+      }
+
+      String arcLabel = termAtt.toString();
+      if (offsetAtt != null) {
+        final int startOffset = offsetAtt.startOffset();
+        final int endOffset = offsetAtt.endOffset();
+        //System.out.println("start=" + startOffset + " end=" + endOffset + " len=" + inputText.length());
+        if (inputText != null) {
+          arcLabel += "  / " + inputText.substring(startOffset, endOffset);
+        } else {
+          arcLabel += " / " + startOffset + "-" + endOffset;
+        }
+      }
+
+      final int endPos = pos + posLengthAtt.getPositionLength();
+      writeArc(pos, endPos, arcLabel, null);
+      lastEndPos = endPos;
+    }
+
+    in.end();
+
+    if (lastEndPos != -1) {
+      // TODO: should we output any final text (from end
+      // offsets) on this arc...?
+      writeNode(-2, null);
+      writeArc(lastEndPos, -2, null, null);
+    }
+
+    writeTrailer();
+
+    // The caller owns the writer; just make sure nothing stays buffered:
+    out.flush();
+  }
+
+  /** Writes one arc; a null label or style omits that attribute.
+   *  The label is escaped so that terms containing quotes or
+   *  backslashes still produce a well-formed dot file. */
+  protected void writeArc(int fromNode, int toNode, String label, String style) {
+    out.print("  " + fromNode + " -> " + toNode + " [");
+    if (label != null) {
+      out.print(" label=\"" + escapeLabel(label) + "\"");
+    }
+    if (style != null) {
+      out.print(" style=\"" + style + "\"");
+    }
+    out.println("]");
+  }
+
+  /** Writes one node; a null label draws it as a white point
+   *  instead of a labelled node. */
+  protected void writeNode(int name, String label) {
+    out.print("  " + name);
+    if (label != null) {
+      out.print(" [label=\"" + escapeLabel(label) + "\"]");
+    } else {
+      out.print(" [shape=point color=white]");
+    }
+    out.println();
+  }
+
+  /** Escapes backslash and double-quote so arbitrary text can be
+   *  embedded in a double-quoted dot label. */
+  private static String escapeLabel(String label) {
+    return label.replace("\\", "\\\\").replace("\"", "\\\"");
+  }
+
+  private final static String FONT_NAME = "Helvetica";
+
+  /** Override to customize. */
+  protected void writeHeader() {
+    out.println("digraph tokens {");
+    out.println("  graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\" ];");
+    out.println("  // A2 paper size");
+    out.println("  size = \"34.4,16.5\";");
+    //out.println("  // try to fill paper");
+    //out.println("  ratio = fill;");
+    out.println("  edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]");
+    out.println("  node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]");
+    out.println();
+  }
+
+  /** Override to customize. */
+  protected void writeTrailer() {
+    out.println("}");
+  }
+}

Property changes on: lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToDot.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
Index: lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java	(revision 1294102)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java	(working copy)
@@ -17,13 +17,18 @@
  * limitations under the License.
  */
 
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
 import java.io.Reader;
 import java.io.StringReader;
-import java.io.IOException;
+import java.io.StringWriter;
+import java.io.Writer;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Random;
- 
+
 import org.apache.lucene.analysis.tokenattributes.*;
 import org.apache.lucene.util.Attribute;
 import org.apache.lucene.util.AttributeImpl;
@@ -83,7 +88,7 @@
     }
   }
 
-  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
     assertNotNull(output);
     CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
     
@@ -107,6 +112,12 @@
       assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
       posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
     }
+
+    PositionLengthAttribute posLengthAtt = null;
+    if (posLengths != null) {
+      assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
+      posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
+    }
     
     ts.reset();
     for (int i = 0; i < output.length; i++) {
@@ -116,6 +127,7 @@
       if (offsetAtt != null) offsetAtt.setOffset(14584724,24683243);
       if (typeAtt != null) typeAtt.setType("bogusType");
       if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
+      if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);
       
       checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
       assertTrue("token "+i+" does not exist", ts.incrementToken());
@@ -130,6 +142,8 @@
         assertEquals("type "+i, types[i], typeAtt.type());
       if (posIncrements != null)
         assertEquals("posIncrement "+i, posIncrements[i], posIncrAtt.getPositionIncrement());
+      if (posLengths != null)
+        assertEquals("posLength "+i, posLengths[i], posLengthAtt.getPositionLength());
       
       // we can enforce some basic things about a few attributes even if the caller doesn't check:
       if (offsetAtt != null) {
@@ -138,14 +152,18 @@
         assertTrue("endOffset must be >= startOffset", offsetAtt.endOffset() >= offsetAtt.startOffset());
         if (finalOffset != null) {
           assertTrue("startOffset must be <= finalOffset", offsetAtt.startOffset() <= finalOffset.intValue());
-          assertTrue("endOffset must be <= finalOffset", offsetAtt.endOffset() <= finalOffset.intValue());
+          assertTrue("endOffset must be <= finalOffset: got endOffset=" + offsetAtt.endOffset() + " vs finalOffset=" + finalOffset.intValue(),
+                     offsetAtt.endOffset() <= finalOffset.intValue());
         }
       }
       if (posIncrAtt != null) {
         assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
       }
+      if (posLengthAtt != null) {
+        assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
+      }
     }
-    assertFalse("end of stream", ts.incrementToken());
+    assertFalse("TokenStream has more tokens than expected", ts.incrementToken());
     ts.end();
     if (finalOffset != null)
       assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
@@ -155,65 +173,81 @@
     ts.close();
   }
   
+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, finalOffset);
+  }
+
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
-    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null);
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, null);
   }
 
   public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
-    assertTokenStreamContents(ts, output, null, null, null, null, null);
+    assertTokenStreamContents(ts, output, null, null, null, null, null, null);
   }
   
   public static void assertTokenStreamContents(TokenStream ts, String[] output, String[] types) throws IOException {
-    assertTokenStreamContents(ts, output, null, null, types, null, null);
+    assertTokenStreamContents(ts, output, null, null, types, null, null, null);
   }
   
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] posIncrements) throws IOException {
-    assertTokenStreamContents(ts, output, null, null, null, posIncrements, null);
+    assertTokenStreamContents(ts, output, null, null, null, posIncrements, null, null);
   }
   
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
-    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null);
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, null);
   }
   
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], Integer finalOffset) throws IOException {
-    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, finalOffset);
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, finalOffset);
   }
   
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
-    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null);
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null, null);
   }
 
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, Integer finalOffset) throws IOException {
-    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, finalOffset);
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null, finalOffset);
   }
   
+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, int[] posLengths, Integer finalOffset) throws IOException {
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, posLengths, finalOffset);
+  }
+  
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
-    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, input.length());
+    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
   }
   
+  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException {
+    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
+  }
+  
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
-    assertAnalyzesTo(a, input, output, null, null, null, null);
+    assertAnalyzesTo(a, input, output, null, null, null, null, null);
   }
   
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws IOException {
-    assertAnalyzesTo(a, input, output, null, null, types, null);
+    assertAnalyzesTo(a, input, output, null, null, types, null, null);
   }
   
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int[] posIncrements) throws IOException {
-    assertAnalyzesTo(a, input, output, null, null, null, posIncrements);
+    assertAnalyzesTo(a, input, output, null, null, null, posIncrements, null);
   }
+
+  public static void assertAnalyzesToPositions(Analyzer a, String input, String[] output, int[] posIncrements, int[] posLengths) throws IOException {
+    assertAnalyzesTo(a, input, output, null, null, null, posIncrements, posLengths);
+  }
   
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
-    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null);
+    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null, null);
   }
   
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
-    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, posIncrements);
+    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, posIncrements, null);
   }
   
 
   public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
-    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, input.length());
+    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
   }
   
   public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output) throws IOException {
@@ -313,7 +347,7 @@
       }
 
       if (VERBOSE) {
-        System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
+        System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
       }
 
       int remainder = random.nextInt(10);
@@ -323,10 +357,12 @@
       CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
       OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
       PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null;
+      PositionLengthAttribute posLengthAtt = ts.hasAttribute(PositionLengthAttribute.class) ? ts.getAttribute(PositionLengthAttribute.class) : null;
       TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null;
       List<String> tokens = new ArrayList<String>();
       List<String> types = new ArrayList<String>();
       List<Integer> positions = new ArrayList<Integer>();
+      List<Integer> positionLengths = new ArrayList<Integer>();
       List<Integer> startOffsets = new ArrayList<Integer>();
       List<Integer> endOffsets = new ArrayList<Integer>();
       ts.reset();
@@ -334,6 +370,7 @@
         tokens.add(termAtt.toString());
         if (typeAtt != null) types.add(typeAtt.type());
         if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
+        if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
         if (offsetAtt != null) {
           startOffsets.add(offsetAtt.startOffset());
           endOffsets.add(offsetAtt.endOffset());
@@ -344,11 +381,21 @@
       // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
       if (!tokens.isEmpty()) {
         if (VERBOSE) {
-          System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
+          System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
         }
         reader = new StringReader(text);
         ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
-        if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
+        if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
+          // offset + pos + posLength + type
+          assertTokenStreamContents(ts, 
+            tokens.toArray(new String[tokens.size()]),
+            toIntArray(startOffsets),
+            toIntArray(endOffsets),
+            types.toArray(new String[types.size()]),
+            toIntArray(positions),
+            toIntArray(positionLengths),
+            text.length());
+        } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
           // offset + pos + type
           assertTokenStreamContents(ts, 
             tokens.toArray(new String[tokens.size()]),
@@ -356,7 +403,18 @@
             toIntArray(endOffsets),
             types.toArray(new String[types.size()]),
             toIntArray(positions),
+            null,
             text.length());
+        } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
+          // offset + pos + posLength
+          assertTokenStreamContents(ts, 
+              tokens.toArray(new String[tokens.size()]),
+              toIntArray(startOffsets),
+              toIntArray(endOffsets),
+              null,
+              toIntArray(positions),
+              toIntArray(positionLengths),
+              text.length());
         } else if (posIncAtt != null && offsetAtt != null) {
           // offset + pos
           assertTokenStreamContents(ts, 
@@ -365,6 +423,7 @@
               toIntArray(endOffsets),
               null,
               toIntArray(positions),
+              null,
               text.length());
         } else if (offsetAtt != null) {
           // offset
@@ -374,6 +433,7 @@
               toIntArray(endOffsets),
               null,
               null,
+              null,
               text.length());
         } else {
           // terms only
@@ -383,6 +443,31 @@
       }
     }
   }
+
+  /** Analyzes inputText with the given Analyzer and returns the
+   *  token graph as a dot (graphviz) string. */
+  protected String toDot(Analyzer a, String inputText) throws IOException {
+    final StringWriter sw = new StringWriter();
+    final TokenStream ts = a.tokenStream("field", new StringReader(inputText));
+    ts.reset();
+    new TokenStreamToDot(inputText, ts, new PrintWriter(sw)).toDot();
+    return sw.toString();
+  }
+
+  /** Analyzes inputText with the given Analyzer and writes the token
+   *  graph, as UTF-8 dot (graphviz) text, to localFileName.  The file
+   *  is closed even if analysis fails. */
+  protected void toDotFile(Analyzer a, String inputText, String localFileName) throws IOException {
+    final Writer w = new OutputStreamWriter(new FileOutputStream(localFileName), "UTF-8");
+    try {
+      final TokenStream ts = a.tokenStream("field", new StringReader(inputText));
+      ts.reset();
+      new TokenStreamToDot(inputText, ts, new PrintWriter(w)).toDot();
+    } finally {
+      // Always release the file handle, even when analysis throws:
+      w.close();
+    }
+  }
   
   static int[] toIntArray(List<Integer> list) {
     int ret[] = new int[list.size()];
Index: solr/core/src/test/org/apache/solr/analysis/TestKuromojiTokenizerFactory.java
===================================================================
--- solr/core/src/test/org/apache/solr/analysis/TestKuromojiTokenizerFactory.java	(revision 1294102)
+++ solr/core/src/test/org/apache/solr/analysis/TestKuromojiTokenizerFactory.java	(working copy)
@@ -50,7 +50,7 @@
     factory.inform(new SolrResourceLoader(null, null));
     TokenStream ts = factory.create(new StringReader("シニアソフトウェアエンジニア"));
     assertTokenStreamContents(ts,
-        new String[] { "シニア", "ソフトウェア", "エンジニア" }
+                              new String[] { "シニア", "シニアソフトウェアエンジニア", "ソフトウェア", "エンジニア" }
     );
   }
   
Index: solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java
===================================================================
--- solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java	(revision 1294102)
+++ solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java	(working copy)
@@ -28,8 +28,7 @@
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
 import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
 import org.apache.lucene.util.IOUtils;
 import org.apache.solr.analysis.BaseTokenizerFactory;
@@ -88,7 +87,7 @@
   
   @Override
   public Tokenizer create(Reader input) {
-    return new KuromojiTokenizer(new Segmenter(userDictionary, mode), input);
+    return new KuromojiTokenizer(input, userDictionary, true, mode);
   }
   
   private Mode getMode(Map<String, String> args) {
@@ -96,7 +95,7 @@
     if (mode != null) {
       return Mode.valueOf(mode.toUpperCase(Locale.ENGLISH));
     } else {
-      return Segmenter.DEFAULT_MODE;
+      return KuromojiTokenizer.DEFAULT_MODE;
     }
   }
 }
