diff --git a/lucene/analysis/kuromoji/build.xml b/lucene/analysis/kuromoji/build.xml index 7afa31d3e92..2045d42b329 100644 --- a/lucene/analysis/kuromoji/build.xml +++ b/lucene/analysis/kuromoji/build.xml @@ -31,19 +31,32 @@ - + - + + + + + + + + + + + @@ -60,10 +73,15 @@ - - - - + + + + + @@ -71,7 +89,7 @@ originalfile="${dict.src.dir}/Noun.proper.csv"/> - + diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java index 5004b62b3ca..4df26864a0e 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java @@ -53,7 +53,7 @@ public class DictionaryBuilder { .build(inputDir) .write(outputDir); - new UnknownDictionaryBuilder(encoding) + new UnknownDictionaryBuilder(format, encoding) .build(inputDir) .write(outputDir); diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java index 4bb8d59b9ef..371fcd64200 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java @@ -73,8 +73,10 @@ class TokenInfoDictionaryBuilder { while ((line = reader.readLine()) != null) { String[] entry = CSVUtil.parse(line); - if (entry.length < 13) { + if (this.format == DictionaryFormat.IPADIC && entry.length < 13) { throw new IllegalArgumentException("Entry in CSV is not valid (13 field values expected): " + line); + } else if (this.format == DictionaryFormat.UNIDIC && entry.length < 21) { + throw new IllegalArgumentException("Entry in CSV is not valid (21 field values expected): " + line); } lines.add(formatEntry(entry)); @@ -149,9 +151,10 @@ class TokenInfoDictionaryBuilder { * 3 - word cost * 4-9 - pos * 10 - base form reading - * 11 - base form + * 11 - lexeme - not used * 12 - surface form * 13 - surface reading + * 14 - orth form */ private String[] formatEntry(String[] features) { @@ -169,7 +172,7 @@ class TokenInfoDictionaryBuilder { features2[7] = features[7]; features2[8] = features[8]; features2[9] = features[9]; - features2[10] = features[11]; + features2[10] = features[14]; // If the surface reading is non-existent, use surface form for reading and pronunciation. // This happens with punctuation in UniDic and there are possibly other cases as well diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryBuilder.java index 4316add3a6c..3b530904ad7 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryBuilder.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryBuilder.java @@ -28,14 +28,18 @@ import java.util.Comparator; import java.util.List; import org.apache.lucene.analysis.ja.dict.CharacterDefinition; +import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat; class UnknownDictionaryBuilder { private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*"; private final String encoding; + private final DictionaryFormat format; - UnknownDictionaryBuilder(String encoding) { + + UnknownDictionaryBuilder(DictionaryFormat format, String encoding) { this.encoding = encoding; + this.format = format; } public UnknownDictionaryWriter build(Path dir) throws IOException { @@ -61,7 +65,12 @@ class UnknownDictionaryBuilder { while ((line = lineReader.readLine()) != null) { // note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation, // even though the unknown dictionary returns hardcoded null here. - final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry + final String[] parsed; + if (this.format == DictionaryFormat.UNIDIC) { + parsed = CSVUtil.parse(line + ",*,*,*"); // UniDic needs one more column + } else { + parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry + } lines.add(parsed); } } diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/CharacterDefinition.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/CharacterDefinition.dat index 4b8bd4b5b84..397d601767f 100644 Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/CharacterDefinition.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/CharacterDefinition.dat differ diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/ConnectionCosts.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/ConnectionCosts.dat index 7679f14d8e3..d99de6d25b2 100644 Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/ConnectionCosts.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/ConnectionCosts.dat differ diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$buffer.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$buffer.dat index 09f1e46680b..3189486e969 100644 Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$buffer.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$buffer.dat differ diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat index 9328c53ee38..a9e0c70b374 100644 Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat differ diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$posDict.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$posDict.dat index e727d90c40a..a4e509fc4a4 100644 Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$posDict.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$posDict.dat differ diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$targetMap.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$targetMap.dat index 13d09bc2eb6..ed34cacdeb8 100644 Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$targetMap.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$targetMap.dat differ diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$buffer.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$buffer.dat index 16f0a822cce..18a364deffa 100644 Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$buffer.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$buffer.dat differ diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$posDict.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$posDict.dat index e709dccab5b..139bdf5e504 100644 Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$posDict.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$posDict.dat differ diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$targetMap.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$targetMap.dat index e8db0b363b2..9299529eea4 100644 Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$targetMap.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$targetMap.dat differ