Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java =================================================================== --- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java (revision 1229796) +++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java (working copy) @@ -65,14 +65,7 @@ } public int get(int forwardId, int backwardId) { - // FIXME: There seems to be something wrong with the double array trie in some rare - // cases causing and IndexOutOfBoundsException. Use a guard as a temporary work-around - // and return a high cost to advise Mr. Viterbi strongly to not use this transition - if (backwardId < costs.length && forwardId < costs[backwardId].length ) { - return costs[backwardId][forwardId]; - } else { - return 50000; - } + return costs[backwardId][forwardId]; } public static ConnectionCosts getInstance() { Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java =================================================================== --- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java (revision 1229796) +++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java (working copy) @@ -166,7 +166,7 @@ List<ViterbiNode> bestPath = viterbi.search(lattice); for (ViterbiNode node : bestPath) { int wordId = node.getWordId(); - if (node.getType() == Type.KNOWN && wordId == 0){ // Do not include BOS/EOS + if (node.getType() == Type.KNOWN && wordId == -1){ // Do not include BOS/EOS continue; } else if (discardPunctuation && node.getLength() > 0 && isPunctuation(node.getSurfaceForm()[node.getOffset()])) { continue; // Do not emit punctuation Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java =================================================================== --- 
modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java (revision 1229796) +++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java (working copy) @@ -206,7 +206,7 @@ int[] startSizeArr = new int[length + 2]; // array to keep ViterbiNode count in startIndexArr int[] endSizeArr = new int[length + 2]; // array to keep ViterbiNode count in endIndexArr FST.Arc arc = new FST.Arc(); - ViterbiNode bosNode = new ViterbiNode(0, BOS, 0, BOS.length, 0, 0, 0, -1, Type.KNOWN); + ViterbiNode bosNode = new ViterbiNode(-1, BOS, 0, BOS.length, 0, 0, 0, -1, Type.KNOWN); addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr); // Process user dictionary; @@ -277,7 +277,7 @@ } } - ViterbiNode eosNode = new ViterbiNode(0, EOS, 0, EOS.length, 0, 0, 0, length + 1, Type.KNOWN); + ViterbiNode eosNode = new ViterbiNode(-1, EOS, 0, EOS.length, 0, 0, 0, length + 1, Type.KNOWN); addToArrays(eosNode, length + 1, 0, startIndexArr, endIndexArr, startSizeArr, endSizeArr); //Add EOS node to endIndexArr at index 0 ViterbiNode[][][] result = new ViterbiNode[][][]{startIndexArr, endIndexArr}; Index: modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat =================================================================== Cannot display: file marked as a binary type. 
svn:mime-type = application/octet-stream Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java =================================================================== --- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java (revision 1229796) +++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java (working copy) @@ -175,7 +175,19 @@ assertEquals("助動詞", tokens.get(7).getPartOfSpeech()); assertEquals("記号-句点", tokens.get(8).getPartOfSpeech()); } - + + public void testYabottai() { + List<Token> tokens = segmenter.tokenize("やぼったい"); + assertEquals(1, tokens.size()); + assertEquals("やぼったい", tokens.get(0).getSurfaceFormString()); + } + + public void testTsukitosha() { + List<Token> tokens = segmenter.tokenize("突き通しゃ"); + assertEquals(1, tokens.size()); + assertEquals("突き通しゃ", tokens.get(0).getSurfaceFormString()); + } + public void testBocchan() throws Exception { doTestBocchan(1); } Index: modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java =================================================================== --- modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (revision 1229796) +++ modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (working copy) @@ -45,7 +45,7 @@ public class TokenInfoDictionaryBuilder { /** Internal word id - incrementally assigned as entries are read and added. This will be byte offset of dictionary file */ - private int offset = 4; // Start from 4. First 4 bytes are used to store size of dictionary file. + private int offset = 0; private String encoding = "euc-jp";