diff --git a/lucene/analysis/kuromoji/build.xml b/lucene/analysis/kuromoji/build.xml
index 7afa31d3e92..2045d42b329 100644
--- a/lucene/analysis/kuromoji/build.xml
+++ b/lucene/analysis/kuromoji/build.xml
@@ -31,19 +31,32 @@
-
+
-
+
+
+
+
+
+
+
+
+
+
+
@@ -60,10 +73,15 @@
-
-
-
-
+
+
+
+
+
@@ -71,7 +89,7 @@
originalfile="${dict.src.dir}/Noun.proper.csv"/>
-
+
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java
index 5004b62b3ca..4df26864a0e 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java
@@ -53,7 +53,7 @@ public class DictionaryBuilder {
.build(inputDir)
.write(outputDir);
- new UnknownDictionaryBuilder(encoding)
+ new UnknownDictionaryBuilder(format, encoding)
.build(inputDir)
.write(outputDir);
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
index 4bb8d59b9ef..371fcd64200 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
@@ -73,8 +73,10 @@ class TokenInfoDictionaryBuilder {
while ((line = reader.readLine()) != null) {
String[] entry = CSVUtil.parse(line);
- if (entry.length < 13) {
+ if (this.format == DictionaryFormat.IPADIC && entry.length < 13) {
throw new IllegalArgumentException("Entry in CSV is not valid (13 field values expected): " + line);
+ } else if (this.format == DictionaryFormat.UNIDIC && entry.length < 21) {
+ throw new IllegalArgumentException("Entry in CSV is not valid (21 field values expected): " + line);
}
lines.add(formatEntry(entry));
@@ -149,9 +151,10 @@ class TokenInfoDictionaryBuilder {
* 3 - word cost
* 4-9 - pos
* 10 - base form reading
- * 11 - base form
+ * 11 - lexeme - not used
* 12 - surface form
* 13 - surface reading
+ * 14 - orth form (now read instead of field 11 as the source of the base form)
*/
private String[] formatEntry(String[] features) {
@@ -169,7 +172,7 @@ class TokenInfoDictionaryBuilder {
features2[7] = features[7];
features2[8] = features[8];
features2[9] = features[9];
- features2[10] = features[11];
+ features2[10] = features[14];
// If the surface reading is non-existent, use surface form for reading and pronunciation.
// This happens with punctuation in UniDic and there are possibly other cases as well
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryBuilder.java
index 4316add3a6c..3b530904ad7 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryBuilder.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryBuilder.java
@@ -28,14 +28,18 @@ import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.ja.dict.CharacterDefinition;
+import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
class UnknownDictionaryBuilder {
private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*";
private final String encoding;
+ private final DictionaryFormat format;
- UnknownDictionaryBuilder(String encoding) {
+
+ UnknownDictionaryBuilder(DictionaryFormat format, String encoding) {
this.encoding = encoding;
+ this.format = format;
}
public UnknownDictionaryWriter build(Path dir) throws IOException {
@@ -61,7 +65,12 @@ class UnknownDictionaryBuilder {
while ((line = lineReader.readLine()) != null) {
// note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation,
// even though the unknown dictionary returns hardcoded null here.
- final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
+ final String[] parsed;
+ if (this.format == DictionaryFormat.UNIDIC) {
+ parsed = CSVUtil.parse(line + ",*,*,*"); // UniDic: pad with three "*" placeholder fields, one more than the other formats
+ } else {
+ parsed = CSVUtil.parse(line + ",*,*"); // other formats: pad with two "*" placeholder fields; the entry is parsed without validation
+ }
lines.add(parsed);
}
}
diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/CharacterDefinition.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/CharacterDefinition.dat
index 4b8bd4b5b84..397d601767f 100644
Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/CharacterDefinition.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/CharacterDefinition.dat differ
diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/ConnectionCosts.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/ConnectionCosts.dat
index 7679f14d8e3..d99de6d25b2 100644
Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/ConnectionCosts.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/ConnectionCosts.dat differ
diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$buffer.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$buffer.dat
index 09f1e46680b..3189486e969 100644
Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$buffer.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$buffer.dat differ
diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat
index 9328c53ee38..a9e0c70b374 100644
Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat differ
diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$posDict.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$posDict.dat
index e727d90c40a..a4e509fc4a4 100644
Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$posDict.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$posDict.dat differ
diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$targetMap.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$targetMap.dat
index 13d09bc2eb6..ed34cacdeb8 100644
Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$targetMap.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$targetMap.dat differ
diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$buffer.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$buffer.dat
index 16f0a822cce..18a364deffa 100644
Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$buffer.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$buffer.dat differ
diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$posDict.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$posDict.dat
index e709dccab5b..139bdf5e504 100644
Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$posDict.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$posDict.dat differ
diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$targetMap.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$targetMap.dat
index e8db0b363b2..9299529eea4 100644
Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$targetMap.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/UnknownDictionary$targetMap.dat differ