Index: contrib/icu/build.xml
===================================================================
--- contrib/icu/build.xml (revision 937043)
+++ contrib/icu/build.xml (working copy)
@@ -43,7 +43,38 @@
Warning: only works on a big-endian platform!
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: contrib/icu/src/data/uax29/Hebrew.rbbi
===================================================================
--- contrib/icu/src/data/uax29/Hebrew.rbbi (revision 0)
+++ contrib/icu/src/data/uax29/Hebrew.rbbi (revision 0)
@@ -0,0 +1,61 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+# This is an example of rule tailoring for Hebrew.
+# In this example the single-quote is added to the Extend category
+# The double-quote is added to the MidLetter category.
+#
+!!chain;
+$CR = [\p{Word_Break = CR}];
+$LF = [\p{Word_Break = LF}];
+$Newline = [\p{Word_Break = Newline}];
+$Extend = [\p{Word_Break = Extend}\u0027];
+$Format = [\p{Word_Break = Format}];
+$ALetter = [\p{Word_Break = ALetter}];
+$MidNumLet = [\p{Word_Break = MidNumLet}];
+$MidLetter = [\p{Word_Break = MidLetter}\u0022];
+$MidNum = [\p{Word_Break = MidNum}];
+$Numeric = [\p{Word_Break = Numeric}];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+$dictionary = [:LineBreak = Complex_Context:];
+$Control = [\p{Grapheme_Cluster_Break = Control}];
+$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];
+# Each *Ex form below is a base character followed by any number of Extend/Format characters.
+$ALetterEx = $ALetterPlus ($Extend | $Format)*;
+$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
+$MidLetterEx = $MidLetter ($Extend | $Format)*;
+$MidNumEx = $MidNum ($Extend | $Format)*;
+$NumericEx = $Numeric ($Extend | $Format)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
+
+!!forward;
+# Word rules. The number in braces is the rule status value reported for the matched text.
+$CR $LF;
+[^$CR $LF $Newline]? ($Extend | $Format)+;
+$NumericEx {100};
+$ALetterEx {200};
+$ALetterEx $ALetterEx {200};
+$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
+$NumericEx $NumericEx {100};
+$ALetterEx $NumericEx {200};
+$NumericEx $ALetterEx {200};
+$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
+$ALetterEx $ExtendNumLetEx {200};
+$NumericEx $ExtendNumLetEx {100};
+$ExtendNumLetEx $ExtendNumLetEx {200};
+$ExtendNumLetEx $ALetterEx {200};
+$ExtendNumLetEx $NumericEx {100};
Index: contrib/icu/src/data/uax29/Khmer.rbbi
===================================================================
--- contrib/icu/src/data/uax29/Khmer.rbbi (revision 0)
+++ contrib/icu/src/data/uax29/Khmer.rbbi (revision 0)
@@ -0,0 +1,61 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+# Parses Khmer text, with orthographic syllable as token.
+#
+# The definition of Khmer orthographic syllable is taken from the Unicode Standard.
+#
+# B = base character (consonant, independent vowel, etc)
+$KhmerBase = [\u1780-\u17B3];
+# R = robat
+$KhmerRobat = [\u17CC];
+# C = consonant shifter
+$KhmerShifter = [\u17C9\u17CA];
+# S = subscript consonant or independent vowel sign
+$KhmerSub = ([\u17D2] $KhmerBase);
+# V = dependent vowel sign
+$KhmerVowel = [\u17B4-\u17C5];
+# Z = zero-width joiner or non-joiner
+$KhmerZWC = [\u200C\u200D];
+# O = any other sign
+$KhmerSign = [\u17C6-\u17C8\u17CB\u17CD-\u17D1\u17DC\u17DD];
+
+$WordJoin = [:Line_Break=Word_Joiner:];
+# Orthographic syllable, in the letters defined above: B (R | C)? (S R?)* (Z? V)? O? S?
+$KhmerSyllableEx = $KhmerBase ($KhmerRobat | $KhmerShifter)? ($KhmerSub ($KhmerRobat)?)* (($KhmerZWC)? $KhmerVowel)? ($KhmerSign)? ($KhmerSub)?;
+# Syllables linked by a Word_Joiner character are kept together as a single match.
+$KhmerJoinedSyllableEx = $KhmerSyllableEx ($WordJoin $KhmerSyllableEx)*;
+
+#
+# default numerical definitions
+#
+$Extend = [\p{Word_Break = Extend}];
+$Format = [\p{Word_Break = Format}];
+$MidNumLet = [\p{Word_Break = MidNumLet}];
+$MidNum = [\p{Word_Break = MidNum}];
+$Numeric = [\p{Word_Break = Numeric}];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
+$MidNumEx = $MidNum ($Extend | $Format)*;
+$NumericEx = $Numeric ($Extend | $Format)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
+
+!!forward;
+$KhmerJoinedSyllableEx {200};
+
+# default numeric rules
+$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};
Index: contrib/icu/src/data/uax29/Lao.rbbi
===================================================================
--- contrib/icu/src/data/uax29/Lao.rbbi (revision 0)
+++ contrib/icu/src/data/uax29/Lao.rbbi (revision 0)
@@ -0,0 +1,192 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Parses Lao text, with syllable as token.
+#
+# The definition of a Lao syllable is based on:
+#
+# Syllabification of Lao Script for Line Breaking
+# Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
+# Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP
+# http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
+# http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
+#
+# NOTE:
+# There are some ambiguities in Lao syllabification without additional processing, as mentioned in the paper.
+# For this reason, this RBBI grammar really only works with LaoBreakIterator, as it does this additional work.
+#
+# Syllable structure, where X is the nuclear consonant:
+#
+# +----+
+# | X5 |
+# +----+
+# | X4 |
+# +----+----+----+----+----+----+----+-----+
+# | X0 | X1 | X | X6 | X7 | X8 | X9 | X10 |
+# +----+----+----+----+----+----+----+-----+
+# | X2 |
+# +----+
+# | X3 |
+# +----+
+#
+# X0 represents a vowel which occurs before the nuclear consonant.
+# It can always define the beginning of syllable.
+$X0 = [\u0EC0-\u0EC4];
+# X1 is a combination consonant which comes before the nuclear consonant,
+# but only if nuclear consonant is one of {ງ ຍ ລ ວ ຼ ມ ນ ຣ}
+$X1 = [\u0EAB];
+# X represents the nuclear consonant.
+$X = [\u0E81-\u0EAE\u0EDC\u0EDD];
+# X2 is a combination consonant which comes after the nuclear consonant,
+# which is placed under or next to the nuclear consonant.
+$X2 = [\u0EBC\u0EA3\u0EA7\u0EA5];
+# X3 represents a vowel which occurs under the nuclear consonant.
+$X3 = [\u0EB8\u0EB9];
+# X4 represents a vowel which occurs above the nuclear consonant.
+$X4 = [\u0EB4-\u0EB7\u0ECD\u0EBB\u0EB1];
+# X5 represents a tone mark which occurs above the nuclear consonant or upper vowel.
+$X5 = [\u0EC8-\u0ECB];
+# X6 represents a consonant vowel, which occurs after the nuclear consonant.
+# It functions when the syllable doesn’t have any vowels. And it always exists with X8.
+$X6 = [\u0EA7\u0EAD\u0EBD];
+# X7 represents a final vowel.
+# However X7_1 always represents the end of syllable and it never exists with tone mark.
+$X7 = [\u0EB0\u0EB2\u0EB3];
+# X8 represents an alternate consonant.
+$X8 = [\u0E81\u0E87\u0E8D\u0E94\u0E99\u0EA1\u0E9A\u0EA7];
+# X9 represents alternate consonants used to pronounce foreign terms; they always exist with X10_3.
+$X9 = [\u0E88\u0EAA\u0E8A\u0E9E\u0E9F\u0EA5];
+# X10 represents a sign mark.
+# It always occurs at the end of a syllable, but mostly people keep it separate from syllable.
+$X10 = [\u0EAF\u0EC6\u0ECC];
+
+# Section 1
+$X0_1 = [\u0EC0];
+$X4_1_2 = [\u0EB4\u0EB5];
+$X4_3_4 = [\u0EB6\u0EB7];
+$X4_6 = [\u0EBB];
+$X4_7 = [\u0EB1];
+$X6_2 = [\u0EAD];
+$X6_3 = [\u0EBD];
+$X7_1 = [\u0EB0];
+$X7_2 = [\u0EB2];
+$X10_1 = [\u0EAF];
+$X10_2 = [\u0EC6];
+$X10_3 = [\u0ECC];
+
+$Rule1_1 = $X0_1 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule1_2 = $X0_1 ($X1)? $X ($X2)? $X4_1_2 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule1_3 = $X0_1 ($X1)? $X ($X2)? $X4_3_4 ($X5)? $X6_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule1_4 = $X0_1 ($X1)? $X ($X2)? ($X7_2)? $X7_1;
+$Rule1_5 = $X0_1 ($X1)? $X ($X2)? $X4_6 ($X5)? $X7_2 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule1_6 = $X0_1 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule1_7 = $X0_1 ($X1)? $X ($X2)? ($X4_7)? ($X5)? $X6_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+$Rule1 = ($Rule1_1 | $Rule1_2 | $Rule1_3 | $Rule1_4 | $Rule1_5 | $Rule1_6 | $Rule1_7);
+
+# Section 2
+$X0_2 = [\u0EC1];
+
+$Rule2_1 = $X0_2 ($X1)? $X ($X2)? ($X5)? ($X6)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule2_2 = $X0_2 ($X1)? $X ($X2)? $X7_1;
+$Rule2_3 = $X0_2 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+$Rule2 = ($Rule2_1 | $Rule2_2 | $Rule2_3);
+
+# Section 3
+$X0_3 = [\u0EC2];
+$X8_3 = [\u0E8D];
+$X8_8 = [\u0EA7];
+
+$Rule3_1 = $X0_3 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule3_2 = $X0_3 ($X1)? $X ($X2)? $X7_1;
+$Rule3_3 = $X0_3 ($X1)? $X ($X2)? $X4_7 ($X5)? ($X8_3 | $X8_8);
+
+$Rule3 = ($Rule3_1 | $Rule3_2 | $Rule3_3);
+
+# Section 4
+$X0_4 = [\u0EC4];
+$X6_1 = [\u0EA7];
+
+$Rule4 = $X0_4 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 5
+$X0_5 = [\u0EC3];
+
+$Rule5 = $X0_5 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 6
+$Rule6 = ($X1)? $X ($X2)? $X3 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 7
+$X4_1_4 = [\u0EB4-\u0EB7];
+
+$Rule7 = ($X1)? $X ($X2)? $X4_1_4 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 8
+$X4_5 = [\u0ECD];
+
+$Rule8 = ($X1)? $X ($X2)? $X4_5 ($X5)? ($X7_2)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 9
+
+$Rule9_1 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule9_2 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X6_1 $X7_1;
+
+$Rule9 = ($Rule9_1 | $Rule9_2);
+
+# Section 10
+$Rule10 = ($X1)? $X ($X2)? $X4_7 ($X5)? ($X6_1)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 11
+$Rule11 = ($X1)? $X ($X2)? ($X5)? $X6 $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 12
+$Rule12 = ($X1)? $X ($X2)? ($X5)? $X7_1;
+
+# Section 13
+$Rule13 = ($X1)? $X ($X2)? ($X5)? $X7_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 14
+$X7_3 = [\u0EB3];
+
+$Rule14 = ($X1)? $X ($X2)? ($X5)? $X7_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+$LaoSyllableEx = ($Rule1 | $Rule2 | $Rule3 | $Rule4 | $Rule5 | $Rule6 | $Rule7 | $Rule8 | $Rule9 | $Rule10 | $Rule11 | $Rule12 | $Rule13 | $Rule14);
+
+$WordJoin = [:Line_Break=Word_Joiner:];
+# Syllables linked by a Word_Joiner character are kept together as a single match.
+$LaoJoinedSyllableEx = $LaoSyllableEx ($WordJoin $LaoSyllableEx)*;
+
+#
+# default numerical definitions
+#
+$Extend = [\p{Word_Break = Extend}];
+$Format = [\p{Word_Break = Format}];
+$MidNumLet = [\p{Word_Break = MidNumLet}];
+$MidNum = [\p{Word_Break = MidNum}];
+$Numeric = [\p{Word_Break = Numeric}];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
+$MidNumEx = $MidNum ($Extend | $Format)*;
+$NumericEx = $Numeric ($Extend | $Format)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
+
+!!forward;
+
+$LaoJoinedSyllableEx {200};
+# default numeric rules
+$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};
Index: contrib/icu/src/data/uax29/Myanmar.rbbi
===================================================================
--- contrib/icu/src/data/uax29/Myanmar.rbbi (revision 0)
+++ contrib/icu/src/data/uax29/Myanmar.rbbi (revision 0)
@@ -0,0 +1,50 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+# Parses Myanmar text, with syllable as token.
+#
+
+$Cons = [[:Other_Letter:]&[:Myanmar:]];
+$Virama = [\u1039];
+$Asat = [\u103A];
+
+$WordJoin = [:Line_Break=Word_Joiner:];
+
+#
+# default numerical definitions
+#
+$Extend = [\p{Word_Break = Extend}];
+$Format = [\p{Word_Break = Format}];
+$MidNumLet = [\p{Word_Break = MidNumLet}];
+$MidNum = [\p{Word_Break = MidNum}];
+$Numeric = [\p{Word_Break = Numeric}];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
+$MidNumEx = $MidNum ($Extend | $Format)*;
+$NumericEx = $Numeric ($Extend | $Format)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
+# Syllable: a consonant (optionally virama + consonant) plus any following asat-marked consonants.
+$ConsEx = $Cons ($Extend | $Format)*;
+$AsatEx = $Cons $Asat ($Virama $ConsEx)? ($Extend | $Format)*;
+$MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)? ($AsatEx)*;
+$MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*;
+
+!!forward;
+$MyanmarJoinedSyllableEx {200};
+
+# default numeric rules
+$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};
Index: contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java (revision 0)
+++ contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java (revision 0)
@@ -0,0 +1,170 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.CharacterIterator;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.DictionaryBasedBreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
+import com.ibm.icu.text.UTF16;
+
+/**
+ * Contain all the issues surrounding BreakIterators in ICU in one place.
+ * Basically this boils down to the fact that they aren't very friendly to any
+ * sort of OO design.
+ *
+ * http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to
+ * BreakIterator from RuleBasedBreakIterator
+ *
+ * DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, but
+ * doesn't actually behave as a subclass: it always returns 0 for
+ * getRuleStatus():
+ * http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type
+ * tags
+ */
+abstract class BreakIteratorWrapper {
+  protected final CharArrayIterator textIterator = new CharArrayIterator();
+  protected char text[];
+  protected int start;
+  protected int length;
+
+  abstract int next();
+  abstract int current();
+  abstract int getRuleStatus();
+  abstract void setText(CharacterIterator text);
+
+  void setText(char text[], int start, int length) {
+    this.text = text;
+    this.start = start;
+    this.length = length;
+    textIterator.setText(text, start, length); // one iterator instance is reused for every call
+    setText(textIterator);
+  }
+
+  /**
+   * If it's a DictionaryBasedBreakIterator, it doesn't return rulestatus, so
+   * treat it like a generic BreakIterator. If it's any other
+   * RuleBasedBreakIterator, the rule status can be used for token type. If it's
+   * any other BreakIterator, the rulestatus method is not available, so treat
+   * it like a generic BreakIterator.
+   */
+  static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
+    if (breakIterator instanceof RuleBasedBreakIterator
+        && !(breakIterator instanceof DictionaryBasedBreakIterator))
+      return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
+    else
+      return new BIWrapper(breakIterator);
+  }
+
+  /**
+   * RuleBasedBreakIterator wrapper: RuleBasedBreakIterator (as long as it's not
+   * a DictionaryBasedBreakIterator) behaves correctly, so every call delegates.
+   */
+  static final class RBBIWrapper extends BreakIteratorWrapper {
+    private final RuleBasedBreakIterator rbbi;
+
+    RBBIWrapper(RuleBasedBreakIterator rbbi) {
+      this.rbbi = rbbi;
+    }
+
+    @Override
+    int current() {
+      return rbbi.current();
+    }
+
+    @Override
+    int getRuleStatus() {
+      return rbbi.getRuleStatus();
+    }
+
+    @Override
+    int next() {
+      return rbbi.next();
+    }
+
+    @Override
+    void setText(CharacterIterator text) {
+      rbbi.setText(text);
+    }
+  }
+
+  /**
+   * Generic BreakIterator wrapper: Either the rulestatus method is not
+   * available or always returns 0. Calculate a rulestatus here, from the first
+   * digit or letter found in each token, so it behaves like
+   * RuleBasedBreakIterator.
+   * Note: This is slower than RuleBasedBreakIterator.
+   */
+  static final class BIWrapper extends BreakIteratorWrapper {
+    private final BreakIterator bi;
+    private int status;
+
+    BIWrapper(BreakIterator bi) {
+      this.bi = bi;
+    }
+
+    @Override
+    int current() {
+      return bi.current();
+    }
+
+    @Override
+    int getRuleStatus() {
+      return status;
+    }
+
+    @Override
+    int next() {
+      int current = bi.current();
+      int next = bi.next();
+      status = calcStatus(current, next);
+      return next;
+    }
+
+    private int calcStatus(int current, int next) {
+      if (current == BreakIterator.DONE || next == BreakIterator.DONE)
+        return RuleBasedBreakIterator.WORD_NONE;
+      // break positions are relative to start; convert them to absolute indexes into text
+      int begin = start + current;
+      int end = start + next;
+      // walk the token one code point at a time: the first digit or letter decides the status
+      int codepoint;
+      for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) {
+        codepoint = UTF16.charAt(text, 0, end, i);
+
+        if (UCharacter.isDigit(codepoint))
+          return RuleBasedBreakIterator.WORD_NUMBER;
+        else if (UCharacter.isLetter(codepoint)) {
+          // TODO: try to separately specify ideographic, kana?
+          // [currently all bundled as letter for this case]
+          return RuleBasedBreakIterator.WORD_LETTER;
+        }
+      }
+
+      return RuleBasedBreakIterator.WORD_NONE;
+    }
+
+    @Override
+    void setText(CharacterIterator text) {
+      bi.setText(text);
+      status = RuleBasedBreakIterator.WORD_NONE;
+    }
+  }
+}
Property changes on: contrib\icu\src\java\org\apache\lucene\analysis\icu\segmentation\BreakIteratorWrapper.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CharArrayIterator.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CharArrayIterator.java (revision 0)
+++ contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CharArrayIterator.java (revision 0)
@@ -0,0 +1,117 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.CharacterIterator;
+
+/**
+ * Exposes a slice of a char[] as a CharacterIterator so a BreakIterator can scan it.
+ */
+final class CharArrayIterator implements CharacterIterator {
+  private char array[];
+  private int start;
+  private int length;
+  private int end; // start + length (exclusive)
+  private int pos; // absolute position inside array
+
+  public char [] getText() {
+    return array;
+  }
+
+  public int getStart() {
+    return start;
+  }
+
+  public int getLength() {
+    return length;
+  }
+
+  /**
+   * Point this iterator at a new slice of text and rewind to its first char.
+   *
+   * @param array buffer holding the text
+   * @param start index in the buffer where the slice begins
+   * @param length number of chars in the slice
+   */
+  void setText(final char array[], int start, int length) {
+    this.array = array;
+    this.start = start;
+    this.length = length;
+    this.end = start + length;
+    this.pos = start;
+  }
+
+  public char current() {
+    return (pos != end) ? array[pos] : DONE;
+  }
+
+  public char first() {
+    pos = start;
+    return current();
+  }
+
+  public int getBeginIndex() {
+    return 0; // indexes are reported relative to start
+  }
+
+  public int getEndIndex() {
+    return length;
+  }
+
+  public int getIndex() {
+    return pos - start;
+  }
+
+  public char last() {
+    pos = (end != start) ? end - 1 : end;
+    return current();
+  }
+
+  public char next() {
+    pos++;
+    if (pos < end) {
+      return array[pos];
+    }
+    pos = end;
+    return DONE;
+  }
+
+  public char previous() {
+    pos--;
+    if (pos >= start) {
+      return current();
+    }
+    pos = start;
+    return DONE;
+  }
+
+  public char setIndex(int position) {
+    if (!(position >= 0 && position <= length))
+      throw new IllegalArgumentException("Illegal Position: " + position);
+    pos = start + position;
+    return current();
+  }
+
+  @Override
+  public Object clone() {
+    final CharArrayIterator copy = new CharArrayIterator();
+    copy.setText(array, start, length);
+    copy.pos = pos;
+    return copy;
+  }
+}
Property changes on: contrib\icu\src\java\org\apache\lucene\analysis\icu\segmentation\CharArrayIterator.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java (revision 0)
+++ contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java (revision 0)
@@ -0,0 +1,125 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.BreakIterator;
+
+/**
+ * An internal BreakIterator for multilingual text, following recommendations
+ * from: UAX #29: Unicode Text Segmentation. (http://unicode.org/reports/tr29/)
+ *
+ * See http://unicode.org/reports/tr29/#Tailoring for the motivation of this
+ * design.
+ *
+ * Text is first divided into script boundaries. The processing is then
+ * delegated to the appropriate break iterator for that specific script.
+ *
+ * This break iterator also allows you to retrieve the ISO 15924 script code
+ * associated with a piece of text.
+ *
+ * See also UAX #29, UTR #24
+ */
+final class CompositeBreakIterator {
+  private final ICUTokenizerConfig config;
+  private final BreakIteratorWrapper wordBreakers[] = new BreakIteratorWrapper[UScript.CODE_LIMIT];
+
+  private BreakIteratorWrapper rbbi;
+  private final ScriptIterator scriptIterator = new ScriptIterator();
+
+  private char text[];
+
+  CompositeBreakIterator(ICUTokenizerConfig config) {
+    this.config = config;
+  }
+
+  /**
+   * Advance to the next break. When the breaker for the current script run
+   * has no more breaks, move on to the following script run(s).
+   *
+   * @return the next break position or BreakIterator.DONE
+   */
+  int next() {
+    int boundary = rbbi.next();
+    while (boundary == BreakIterator.DONE && scriptIterator.next()) {
+      useCurrentScriptRun();
+      boundary = rbbi.next();
+    }
+    return toAbsolute(boundary);
+  }
+
+  /** @return the current break position or BreakIterator.DONE */
+  int current() {
+    return toAbsolute(rbbi.current());
+  }
+
+  /**
+   * @return rule status code (token type) reported by the underlying break
+   *         iterator; see the RuleBasedBreakIterator constants
+   */
+  int getRuleStatus() {
+    return rbbi.getRuleStatus();
+  }
+
+  /**
+   * @return UScript script code for the current token; UScript can decode it
+   *         into a name or ISO 15924 code
+   */
+  int getScriptCode() {
+    return scriptIterator.getScriptCode();
+  }
+
+  /**
+   * Start breaking a new region of text.
+   *
+   * @param text buffer of text
+   * @param start offset into buffer
+   * @param length maximum length to examine
+   */
+  void setText(final char text[], int start, int length) {
+    this.text = text;
+    scriptIterator.setText(text, start, length);
+    if (scriptIterator.next()) {
+      useCurrentScriptRun();
+      return;
+    }
+    rbbi = getBreakIterator(UScript.COMMON);
+    rbbi.setText(text, 0, 0);
+  }
+
+  /** Select the breaker for the script run the script iterator is on and hand it that run. */
+  private void useCurrentScriptRun() {
+    rbbi = getBreakIterator(scriptIterator.getScriptCode());
+    final int runStart = scriptIterator.getScriptStart();
+    rbbi.setText(text, runStart, scriptIterator.getScriptLimit() - runStart);
+  }
+
+  /** Offset a run-relative break position by the run start; DONE is passed through. */
+  private int toAbsolute(int boundary) {
+    if (boundary == BreakIterator.DONE)
+      return BreakIterator.DONE;
+    return boundary + scriptIterator.getScriptStart();
+  }
+
+  private BreakIteratorWrapper getBreakIterator(int scriptCode) {
+    BreakIteratorWrapper cached = wordBreakers[scriptCode];
+    if (cached == null)
+      wordBreakers[scriptCode] = cached = BreakIteratorWrapper.wrap(config.getTokenizer(scriptCode));
+    return cached;
+  }
+}
Property changes on: contrib\icu\src\java\org\apache\lucene\analysis\icu\segmentation\CompositeBreakIterator.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java (revision 0)
+++ contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java (revision 0)
@@ -0,0 +1,111 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
+import com.ibm.icu.util.ULocale;
+
+/**
+ * Default {@link ICUTokenizerConfig} that is generally applicable
+ * to many languages.
+ * <p>
+ * Generally tokenizes Unicode text according to UAX#29
+ * ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}),
+ * but with the following tailorings:
+ * <ul>
+ *   <li>Thai text is broken into words with a
+ *   {@link com.ibm.icu.text.DictionaryBasedBreakIterator}
+ *   <li>Lao, Myanmar, and Khmer text is broken into syllables
+ *   based on custom BreakIterator rules.
+ *   <li>Hebrew text has custom tailorings to handle special cases
+ *   involving punctuation.
+ * </ul>
+ */
+public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
+  /** Token type for words containing ideographic characters */
+  public static final String WORD_IDEO = "";
+  /** Token type for words containing Japanese kana */
+  public static final String WORD_KANA = "";
+  /** Token type for words that contain letters */
+  public static final String WORD_LETTER = "";
+  /** Token type for words that appear to be numbers */
+  public static final String WORD_NUMBER = "";
+  // NOTE(review): all four type names above are empty, so types are indistinguishable - confirm.
+
+  /* the default breakiterators in use: expensive to instantiate, cheap to clone. */
+  private static final BreakIterator rootBreakIterator =
+    BreakIterator.getWordInstance(ULocale.ROOT);
+  private static final BreakIterator thaiBreakIterator =
+    BreakIterator.getWordInstance(new ULocale("th_TH"));
+  private static final BreakIterator hebrewBreakIterator =
+    readBreakIterator("Hebrew.brk");
+  private static final BreakIterator khmerBreakIterator =
+    readBreakIterator("Khmer.brk");
+  private static final BreakIterator laoBreakIterator =
+    new LaoBreakIterator(readBreakIterator("Lao.brk"));
+  private static final BreakIterator myanmarBreakIterator =
+    readBreakIterator("Myanmar.brk");
+
+  @Override
+  BreakIterator getTokenizer(int script) {
+    switch(script) {
+      case UScript.THAI: return (BreakIterator)thaiBreakIterator.clone();
+      case UScript.HEBREW: return (BreakIterator)hebrewBreakIterator.clone();
+      case UScript.KHMER: return (BreakIterator)khmerBreakIterator.clone();
+      case UScript.LAO: return (BreakIterator)laoBreakIterator.clone();
+      case UScript.MYANMAR: return (BreakIterator)myanmarBreakIterator.clone();
+      default: return (BreakIterator)rootBreakIterator.clone();
+    }
+  }
+
+  @Override
+  String getType(int script, int ruleStatus) {
+    switch (ruleStatus) {
+      case RuleBasedBreakIterator.WORD_IDEO:
+        return WORD_IDEO;
+      case RuleBasedBreakIterator.WORD_KANA:
+        return WORD_KANA;
+      case RuleBasedBreakIterator.WORD_LETTER:
+        return WORD_LETTER;
+      case RuleBasedBreakIterator.WORD_NUMBER:
+        return WORD_NUMBER;
+      default: /* some other custom code */
+        return "";
+    }
+  }
+
+  private static RuleBasedBreakIterator readBreakIterator(String filename) {
+    InputStream is = DefaultICUTokenizerConfig.class.getResourceAsStream(filename);
+    if (is == null) // resource missing from the classpath: fail with the name, not a bare NPE
+      throw new RuntimeException("break rules resource not found: " + filename);
+    try {
+      try {
+        return RuleBasedBreakIterator.getInstanceFromCompiledRules(is);
+      } finally {
+        is.close(); // always release the stream, even when loading the rules fails
+      }
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+}
Property changes on: contrib\icu\src\java\org\apache\lucene\analysis\icu\segmentation\DefaultICUTokenizerConfig.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java (revision 0)
+++ contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java (revision 0)
@@ -0,0 +1,195 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.BreakIterator;
+
+/**
+ * Breaks text into words according to UAX #29: Unicode Text Segmentation
+ * (http://www.unicode.org/reports/tr29/)
+ *
+ * Words are broken across script boundaries, then segmented according to
+ * the BreakIterator and typing provided by the {@link ICUTokenizerConfig}
+ *
+ * @see ICUTokenizerConfig
+ */
+public final class ICUTokenizer extends Tokenizer {
+ private static final int IOBUFFER = 4096;
+ private final char buffer[] = new char[IOBUFFER];
+ /** true length of text in the buffer */
+ private int length = 0;
+ /** length in buffer that can be evaluated safely, up to a safe end point */
+ private int usableLength = 0;
+ /** accumulated offset of previous buffers for this reader, for offsetAtt */
+ private int offset = 0;
+
+ private final CompositeBreakIterator breaker; /* tokenizes a char[] of text */
+ private final ICUTokenizerConfig config;
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+ private final ScriptAttribute scriptAtt = addAttribute(ScriptAttribute.class);
+
+ /**
+ * Construct a new ICUTokenizer that breaks text into words from the given
+ * Reader.
+ *
+ * The default script-specific handling is used.
+ *
+ * @param input Reader containing text to tokenize.
+ * @see DefaultICUTokenizerConfig
+ */
+ public ICUTokenizer(Reader input) {
+ this(input, new DefaultICUTokenizerConfig());
+ }
+
+ /**
+ * Construct a new ICUTokenizer that breaks text into words from the given
+ * Reader, using a tailored BreakIterator configuration.
+ *
+ * @param input Reader containing text to tokenize.
+ * @param config Tailored BreakIterator configuration
+ */
+ public ICUTokenizer(Reader input, ICUTokenizerConfig config) {
+ super(input);
+ this.config = config;
+ breaker = new CompositeBreakIterator(config);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ clearAttributes();
+ if (length == 0) // the buffer currently holds no text
+ refill();
+ while (!incrementTokenBuffer()) { // usable part of the buffer has no further token
+ refill();
+ if (length <= 0) // refill() left the buffer empty: no more chars to read
+ return false;
+ }
+ return true;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ breaker.setText(buffer, 0, 0);
+ length = usableLength = offset = 0;
+ }
+
+ @Override
+ public void reset(Reader input) throws IOException {
+ super.reset(input);
+ reset();
+ }
+
+ @Override
+ public void end() throws IOException { // final offset = end of all text read so far
+   final int finalOffset = correctOffset((length < 0) ? offset : offset + length); // same mapping as the token offsets
+   offsetAtt.setOffset(finalOffset, finalOffset);
+ }
+
+ /*
+ * This tokenizes text based upon the longest matching rule, and because of
+ * this, isn't friendly to a Reader.
+ *
+ * Text is read from the input stream in 4kB chunks. Within a 4kB chunk of
+ * text, the last unambiguous break point is found (in this implementation:
+ * a white space character). Any remaining characters represent possible partial
+ * words, so they are carried over to the front of the next chunk.
+ *
+ * There is the possibility that there are no unambiguous break points within
+ * an entire 4kB chunk of text (binary data). So there is a maximum word limit
+ * of 4kB since it will not try to grow the buffer in this case.
+ */
+
+ /**
+ * Returns the last unambiguous break position in the text: the index just
+ * past the last whitespace char among the first length chars of the buffer.
+ * @return index following that whitespace char, or -1 if one does not exist
+ */
+ private int findSafeEnd() {
+ for (int i = length - 1; i >= 0; i--)
+ if (UCharacter.isWhitespace(buffer[i]))
+ return i + 1;
+ return -1;
+ }
+
+ /**
+ * Refill the buffer, accumulating the offset and setting usableLength to the
+ * last unambiguous break position. Unconsumed chars are moved to the front and
+ * the rest of the buffer is filled, retrying short reads until end of stream.
+ * @throws IOException if the underlying reader fails
+ */
+ private void refill() throws IOException {
+   offset += usableLength;
+   int leftover = length - usableLength;
+   System.arraycopy(buffer, usableLength, buffer, 0, leftover);
+   length = leftover;
+   // read() may return fewer chars than requested before end of stream: loop until full or -1
+   int returned = 0;
+   while (length < buffer.length
+       && (returned = input.read(buffer, length, buffer.length - length)) >= 0)
+     length += returned;
+   if (returned < 0) /* reader has been emptied, process the rest */
+     usableLength = length;
+   else { /* there may be more data to read, find a safe-stopping place */
+     usableLength = findSafeEnd();
+     if (usableLength < 0) /* a full buffer without any space: tokens may be truncated */
+       usableLength = length;
+   }
+
+   breaker.setText(buffer, 0, Math.max(0, usableLength));
+ }
+
+ /*
+ * Emits the next token of the current buffer into the attributes. Returns true
+ * if a token was produced, or false if the buffer is exhausted.
+ */
+ private boolean incrementTokenBuffer() {
+   int start = breaker.current();
+   if (start == BreakIterator.DONE)
+     return false; // BreakIterator exhausted
+
+   // find the next set of boundaries, skipping over non-tokens (rule status 0)
+   int end = breaker.next();
+   while (end != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
+     start = end;
+     end = breaker.next();
+   }
+
+   if (end == BreakIterator.DONE) // test end, not start: DONE must never become a token boundary
+     return false; // BreakIterator exhausted
+
+   termAtt.copyBuffer(buffer, start, end - start);
+   offsetAtt.setOffset(correctOffset(offset + start), correctOffset(offset + end));
+   typeAtt.setType(config.getType(breaker.getScriptCode(), breaker.getRuleStatus()));
+   scriptAtt.setCode(breaker.getScriptCode());
+
+   return true;
+ }
+}
Property changes on: contrib\icu\src\java\org\apache\lucene\analysis\icu\segmentation\ICUTokenizer.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java (revision 0)
+++ contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java (revision 0)
@@ -0,0 +1,32 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import com.ibm.icu.text.BreakIterator;
+
+/**
+ * Class that allows for tailored Unicode Text Segmentation on
+ * a per-writing system basis.
+ */
+public abstract class ICUTokenizerConfig {
+ /** Return a breakiterator capable of processing a given script. */
+ abstract BreakIterator getTokenizer(int script);
+ /** Return a token type value for a given script and BreakIterator
+ * rule status. */
+ abstract String getType(int script, int ruleStatus);
+}
Property changes on: contrib\icu\src\java\org\apache\lucene\analysis\icu\segmentation\ICUTokenizerConfig.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/LaoBreakIterator.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/LaoBreakIterator.java (revision 0)
+++ contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/LaoBreakIterator.java (revision 0)
@@ -0,0 +1,225 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.CharacterIterator;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * Syllable iterator for Lao text.
+ *
+ * This breaks Lao text into syllables according to:
+ * Syllabification of Lao Script for Line Breaking
+ * Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
+ * Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP.
+ * <ul>
+ *  <li>http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
+ *  <li>http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
+ * </ul>
+ *
+ * Most work is accomplished with RBBI rules, however some additional special logic is needed
+ * that cannot be coded in a grammar, and this is implemented here.
+ *
+ * For example, what appears to be a final consonant might instead be part of the next syllable.
+ * Rules match in a greedy fashion, leaving an illegal sequence that matches no rules.
+ *
+ * Take for instance the text ກວ່າດອກ
+ * The first rule greedily matches ກວ່າດ, but then ອກ is encountered, which is illegal.
+ * What LaoBreakIterator does, according to the paper:
+ * <ol>
+ *  <li>backtrack and remove the ດ from the last syllable, placing it on the current syllable.
+ *  <li>verify the modified previous syllable (ກວ່າ ) is still legal.
+ *  <li>verify the modified current syllable (ດອກ) is now legal.
+ *  <li>If 2 or 3 fails, then restore the ດ to the last syllable and skip the current character.
+ * </ol>
+ *
+ * Finally, LaoBreakIterator also takes care of the second concern mentioned in the paper.
+ * This is the issue of combining marks being in the wrong order (typos).
+ */
+public class LaoBreakIterator extends BreakIterator {
+ RuleBasedBreakIterator rules;
+ CharArrayIterator text;
+
+ CharArrayIterator working = new CharArrayIterator();
+ int workingOffset = 0;
+
+ CharArrayIterator verifyText = new CharArrayIterator();
+ RuleBasedBreakIterator verify;
+
+ private static final UnicodeSet laoSet;
+ static {
+ laoSet = new UnicodeSet("[:Lao:]");
+ laoSet.compact();
+ laoSet.freeze();
+ }
+
+ public LaoBreakIterator(RuleBasedBreakIterator rules) {
+ this.rules = (RuleBasedBreakIterator) rules.clone();
+ this.verify = (RuleBasedBreakIterator) rules.clone();
+ }
+
+ @Override
+ public int current() {
+ int current = rules.current();
+ return current == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + current;
+ }
+
+ @Override
+ public int first() {
+ working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
+ rules.setText(working);
+ workingOffset = 0;
+ int first = rules.first();
+ return first == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + first;
+ }
+
+ @Override
+ public int following(int offset) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public CharacterIterator getText() {
+ return text;
+ }
+
+ @Override
+ public int last() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public int next() {
+ int current = current();
+ int next = rules.next();
+ if (next == BreakIterator.DONE)
+ return next;
+ else
+ next += workingOffset;
+
+ char c = working.current();
+ int following = rules.next(); // lookahead
+ if (following != BreakIterator.DONE) {
+ following += workingOffset;
+ if (rules.getRuleStatus() == 0 && laoSet.contains(c) && verifyPushBack(current, next)) {
+ workingOffset = next - 1;
+ working.setText(text.getText(), text.getStart() + workingOffset, text.getLength() - workingOffset);
+ return next - 1;
+ }
+ rules.previous(); // undo the lookahead
+ }
+
+ return next;
+ }
+
+ @Override
+ public int next(int n) {
+   if (n < 0)
+     throw new UnsupportedOperationException("Backwards traversal is unsupported");
+
+   // advance n boundaries forward; next(0) just reports the current boundary
+   int boundary = current();
+   for (int remaining = n; remaining > 0; remaining--)
+     boundary = next();
+
+   return boundary;
+ }
+
+ @Override
+ public int previous() {
+ throw new UnsupportedOperationException("Backwards traversal is unsupported");
+ }
+
+ @Override
+ public void setText(CharacterIterator text) {
+ if (!(text instanceof CharArrayIterator))
+ throw new UnsupportedOperationException("unsupported CharacterIterator");
+ this.text = (CharArrayIterator) text;
+ ccReorder(this.text.getText(), this.text.getStart(), this.text.getLength());
+ working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
+ rules.setText(working);
+ workingOffset = 0;
+ }
+
+ @Override
+ public void setText(String newText) {
+ CharArrayIterator ci = new CharArrayIterator();
+ ci.setText(newText.toCharArray(), 0, newText.length());
+ setText(ci);
+ }
+
+ private boolean verifyPushBack(int current, int next) {
+ int shortenedSyllable = next - current - 1;
+
+ verifyText.setText(text.getText(), text.getStart() + current, shortenedSyllable);
+ verify.setText(verifyText);
+ if (verify.next() != shortenedSyllable || verify.getRuleStatus() == 0)
+ return false;
+
+
+ verifyText.setText(text.getText(), text.getStart() + next - 1, text.getLength() - next + 1);
+ verify.setText(verifyText);
+
+ return (verify.next() != BreakIterator.DONE && verify.getRuleStatus() != 0);
+ }
+
+ // Reorders chars in place by combining class. TODO: only bubblesort around runs of combining marks, instead of the entire text.
+ private void ccReorder(char[] text, int start, int length) {
+ boolean reordered;
+ do {
+ int prevCC = 0;
+ reordered = false;
+ for (int i = start; i < start + length; i++) {
+ final char c = text[i];
+ final int cc = UCharacter.getCombiningClass(c);
+ if (cc > 0 && cc < prevCC) { // a char with a lower non-zero class follows one with a higher class
+ // swap it with its predecessor (prevCC is left unchanged)
+ text[i] = text[i - 1];
+ text[i - 1] = c;
+ reordered = true;
+ } else {
+ prevCC = cc;
+ }
+ }
+
+ } while (reordered == true); // repeat full passes until one makes no swap
+ }
+
+ /**
+ * Clone method. Creates another LaoBreakIterator with the same behavior
+ * and current state as this one.
+ * @return The clone.
+ */
+ @Override
+ public Object clone() {
+ LaoBreakIterator other = (LaoBreakIterator) super.clone();
+ other.rules = (RuleBasedBreakIterator) rules.clone();
+ other.verify = (RuleBasedBreakIterator) verify.clone();
+ if (text != null)
+ other.text = (CharArrayIterator) text.clone();
+ if (working != null)
+ other.working = (CharArrayIterator) working.clone();
+ if (verifyText != null)
+ other.verifyText = (CharArrayIterator) verifyText.clone();
+ return other;
+ }
+}
Property changes on: contrib\icu\src\java\org\apache\lucene\analysis\icu\segmentation\LaoBreakIterator.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/package.html
===================================================================
--- contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/package.html (revision 0)
+++ contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/package.html (revision 0)
@@ -0,0 +1,22 @@
+
+
+
+
+Tokenizer that breaks text into words with the Unicode Text Segmentation algorithm.
+
+
\ No newline at end of file
Property changes on: contrib\icu\src\java\org\apache\lucene\analysis\icu\segmentation\package.html
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java (revision 0)
+++ contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java (revision 0)
@@ -0,0 +1,169 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Copyright (C) 1999-2010, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, and/or sell copies of the
+ * Software, and to permit persons to whom the Software is furnished to do so,
+ * provided that the above copyright notice(s) and this permission notice appear
+ * in all copies of the Software and that both the above copyright notice(s) and
+ * this permission notice appear in supporting documentation.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
+ * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
+ * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
+ * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+ * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall not
+ * be used in advertising or otherwise to promote the sale, use or other
+ * dealings in this Software without prior written authorization of the
+ * copyright holder.
+ */
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.UTF16;
+
+/**
+ * An iterator that locates ISO 15924 script boundaries in text.
+ *
+ * This is not the same as simply looking at the Unicode block, or even the
+ * Script property. Some characters are 'common' across multiple scripts, and
+ * some 'inherit' the script value of text surrounding them.
+ *
+ * This is similar to ICU (internal-only) UScriptRun, with the following
+ * differences:
+ * <ol>
+ *  <li>Doesn't attempt to match paired punctuation. For tokenization purposes, this
+ * is not necessary. It's also quite expensive.
+ *  <li>Non-spacing marks inherit the script of their base character, following
+ *  recommendations from UTR #24.
+ * </ol>
+ */
+final class ScriptIterator {
+ private char text[];
+ private int start;
+ private int limit;
+ private int index;
+
+ private int scriptStart;
+ private int scriptLimit;
+ private int scriptCode;
+
+ /**
+ * Get the start of this script run
+ *
+ * @return start position of script run
+ */
+ int getScriptStart() {
+ return scriptStart;
+ }
+
+ /**
+ * Get the index of the first character after the end of this script run
+ *
+ * @return position of the first character after this script run
+ */
+ int getScriptLimit() {
+ return scriptLimit;
+ }
+
+ /**
+ * Get the UScript script code for this script run
+ *
+ * @return code for the script of the current run
+ */
+ int getScriptCode() {
+ return scriptCode;
+ }
+
+ /**
+ * Iterates to the next script run, returning true if one exists.
+ *
+ * @return true if there is another script run, false otherwise.
+ */
+ boolean next() {
+ if (scriptLimit >= limit)
+ return false;
+
+ scriptCode = UScript.COMMON;
+ scriptStart = scriptLimit;
+
+ while (index < limit) {
+ final int ch = UTF16.charAt(text, start, limit, index - start);
+ final int sc = getScript(ch);
+
+ /*
+ * From UTR #24: Implementations that determine the boundaries between
+ * characters of given scripts should never break between a non-spacing
+ * mark and its base character. Thus for boundary determinations and
+ * similar sorts of processing, a non-spacing mark — whatever its script
+ * value — should inherit the script value of its base character.
+ */
+ if (isSameScript(scriptCode, sc)
+ || UCharacter.getType(ch) == UCharacter.NON_SPACING_MARK) {
+ index += UTF16.getCharCount(ch);
+
+ /*
+ * Inherited or Common becomes the script code of the surrounding text.
+ */
+ if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) {
+ scriptCode = sc;
+ }
+
+ } else {
+ break;
+ }
+ }
+
+ scriptLimit = index;
+ return true;
+ }
+
+ /** Determine if two scripts are compatible. */
+ private static boolean isSameScript(int scriptOne, int scriptTwo) {
+ return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED
+ || scriptOne == scriptTwo;
+ }
+
+ /**
+ * Set a new region of text to be examined by this iterator
+ *
+ * @param text text buffer to examine
+ * @param start offset into buffer
+ * @param length maximum length to examine
+ */
+ void setText(char text[], int start, int length) {
+ this.text = text;
+ this.start = start;
+ this.index = start;
+ this.limit = start + length;
+ this.scriptStart = start;
+ this.scriptLimit = start;
+ this.scriptCode = UScript.INVALID_CODE;
+ }
+
+ /** linear fast-path for basic latin case */
+ private static final int basicLatin[] = new int[128];
+
+ static {
+ for (int i = 0; i < basicLatin.length; i++)
+ basicLatin[i] = UScript.getScript(i);
+ }
+
+ /** fast version of UScript.getScript(). Basic Latin is an array lookup */
+ private static int getScript(int codepoint) {
+ if (0 <= codepoint && codepoint < basicLatin.length)
+ return basicLatin[codepoint];
+ else
+ return UScript.getScript(codepoint);
+ }
+}
Property changes on: contrib\icu\src\java\org\apache\lucene\analysis\icu\segmentation\ScriptIterator.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttribute.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttribute.java (revision 0)
+++ contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttribute.java (revision 0)
@@ -0,0 +1,50 @@
+package org.apache.lucene.analysis.icu.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.Attribute;
+
+import com.ibm.icu.lang.UScript; // javadoc @link
+
+/**
+ * This attribute stores the UTR #24 script value for a token of text.
+ */
+public interface ScriptAttribute extends Attribute {
+ /**
+ * Get the numeric code for this script value.
+ * This is the constant value from {@link UScript}.
+ * @return numeric code
+ */
+ public int getCode();
+ /**
+ * Set the numeric code for this script value.
+ * This is the constant value from {@link UScript}.
+ * @param code numeric code
+ */
+ public void setCode(int code);
+ /**
+ * Get the full name.
+ * @return UTR #24 full name.
+ */
+ public String getName();
+ /**
+ * Get the abbreviated name.
+ * @return UTR #24 abbreviated name.
+ */
+ public String getShortName();
+}
Property changes on: contrib\icu\src\java\org\apache\lucene\analysis\icu\tokenattributes\ScriptAttribute.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java (revision 0)
+++ contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java (revision 0)
@@ -0,0 +1,78 @@
+package org.apache.lucene.analysis.icu.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Serializable;
+
+import org.apache.lucene.util.AttributeImpl;
+
+import com.ibm.icu.lang.UScript;
+
+public class ScriptAttributeImpl extends AttributeImpl implements ScriptAttribute, Cloneable, Serializable {
+ private int code = UScript.COMMON;
+
+ public int getCode() {
+ return code;
+ }
+
+ public void setCode(int code) {
+ this.code = code;
+ }
+
+ public String getName() {
+ return UScript.getName(code);
+ }
+
+ public String getShortName() {
+ return UScript.getShortName(code);
+ }
+
+ @Override
+ public void clear() {
+ code = UScript.COMMON;
+ }
+
+ @Override
+ public void copyTo(AttributeImpl target) {
+ ScriptAttribute t = (ScriptAttribute) target;
+ t.setCode(code);
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (this == other) {
+ return true;
+ }
+
+ if (other instanceof ScriptAttributeImpl) {
+ return ((ScriptAttributeImpl) other).code == code;
+ }
+
+ return false;
+ }
+
+ @Override
+ public int hashCode() {
+ return code;
+ }
+
+ @Override
+ public String toString() {
+ return "script=" + getName();
+ }
+}
Property changes on: contrib\icu\src\java\org\apache\lucene\analysis\icu\tokenattributes\ScriptAttributeImpl.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Hebrew.brk
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: contrib\icu\src\resources\org\apache\lucene\analysis\icu\segmentation\Hebrew.brk
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Index: contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: contrib\icu\src\resources\org\apache\lucene\analysis\icu\segmentation\Khmer.brk
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Index: contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Lao.brk
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: contrib\icu\src\resources\org\apache\lucene\analysis\icu\segmentation\Lao.brk
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Index: contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Myanmar.brk
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: contrib\icu\src\resources\org\apache\lucene\analysis\icu\segmentation\Myanmar.brk
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Index: contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestCharArrayIterator.java
===================================================================
--- contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestCharArrayIterator.java (revision 0)
+++ contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestCharArrayIterator.java (revision 0)
@@ -0,0 +1,109 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.CharacterIterator;
+
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestCharArrayIterator extends LuceneTestCase {
+ public void testBasicUsage() {
+ final CharArrayIterator iter = new CharArrayIterator();
+ iter.setText("testing".toCharArray(), 0, 7);
+ assertEquals(0, iter.getBeginIndex()); // range starts at the offset given to setText
+ assertEquals(7, iter.getEndIndex()); // one past the final 'g'
+ assertEquals(0, iter.getIndex()); // positioned at the start right after setText
+ assertEquals('t', iter.current());
+ assertEquals('e', iter.next());
+ assertEquals('g', iter.last());
+ assertEquals('n', iter.previous());
+ assertEquals('t', iter.first());
+ assertEquals(CharacterIterator.DONE, iter.previous()); // nothing precedes the first char
+ }
+
+ public void testFirst() {
+ final CharArrayIterator iter = new CharArrayIterator();
+ iter.setText("testing".toCharArray(), 0, 7);
+ iter.next(); // step off the start so first() actually has to move back
+ // first() must reposition to getBeginIndex() and return the char found there...
+ assertEquals('t', iter.first());
+ assertEquals(iter.getBeginIndex(), iter.getIndex());
+ // ...and must report DONE when there is no text at all
+ iter.setText(new char[0], 0, 0);
+ assertEquals(CharacterIterator.DONE, iter.first());
+ }
+
+ public void testLast() {
+ CharArrayIterator ci = new CharArrayIterator();
+ ci.setText("testing".toCharArray(), 0, "testing".length());
+ // last() sets the position to getEndIndex()-1 (getEndIndex() if the text is empty)
+ // and returns the character at that position.
+ assertEquals('g', ci.last());
+ assertEquals(ci.getEndIndex() - 1, ci.getIndex()); // expected value first, actual second
+ // or DONE if the text is empty
+ ci.setText(new char[] {}, 0, 0);
+ assertEquals(CharacterIterator.DONE, ci.last());
+ assertEquals(ci.getEndIndex(), ci.getIndex());
+ }
+
+ public void testCurrent() {
+ final CharArrayIterator iter = new CharArrayIterator();
+ // current() returns the char at the position reported by getIndex()...
+ iter.setText("testing".toCharArray(), 0, 7);
+ assertEquals('t', iter.current());
+ iter.last();
+ iter.next(); // one step past the last char
+ // ...or DONE once the position is off the end of the text
+ assertEquals(CharacterIterator.DONE, iter.current());
+ }
+
+ public void testNext() {
+ final CharArrayIterator iter = new CharArrayIterator();
+ iter.setText("te".toCharArray(), 0, 2);
+ // next() advances the index by one and returns the char at the new index...
+ assertEquals('e', iter.next());
+ assertEquals(1, iter.getIndex());
+ // ...or DONE, with the index left at getEndIndex(), when it runs off the text range
+ assertEquals(CharacterIterator.DONE, iter.next());
+ assertEquals(iter.getEndIndex(), iter.getIndex());
+ }
+
+ public void testSetIndex() {
+ CharArrayIterator ci = new CharArrayIterator();
+ ci.setText("test".toCharArray(), 0, "test".length());
+ try {
+ ci.setIndex(5); // the text range is only 4 chars long
+ fail("setIndex(5) should be rejected for a 4-char text");
+ } catch (IllegalArgumentException expected) {
+ // expected: an out-of-range index must be refused; any other exception fails the test
+ }
+ }
+
+ public void testClone() {
+ final char[] chars = "testing".toCharArray();
+ final CharArrayIterator original = new CharArrayIterator();
+ original.setText(chars, 0, chars.length);
+ original.next(); // move off index 0 so the copied position is non-trivial
+ final CharArrayIterator copy = (CharArrayIterator) original.clone();
+ assertEquals(original.getIndex(), copy.getIndex());
+ assertEquals(original.next(), copy.next()); // both must advance to the same char
+ assertEquals(original.last(), copy.last());
+ }
+
+
+}
Property changes on: contrib\icu\src\test\org\apache\lucene\analysis\icu\segmentation\TestCharArrayIterator.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
===================================================================
--- contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java (revision 0)
+++ contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java (revision 0)
@@ -0,0 +1,212 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
+
+import java.util.Arrays;
+
+public class TestICUTokenizer extends BaseTokenStreamTestCase {
+
+ public void testHugeDoc() throws IOException {
+ StringBuilder sb = new StringBuilder();
+ char whitespace[] = new char[4094];
+ Arrays.fill(whitespace, ' ');
+ sb.append(whitespace);
+ sb.append("testing 1234");
+ String input = sb.toString();
+ ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
+ assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
+ }
+
+ public void testHugeTerm2() throws IOException {
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < 40960; i++) {
+ sb.append('a');
+ }
+ String input = sb.toString();
+ ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
+ char token[] = new char[4096];
+ Arrays.fill(token, 'a');
+ String expectedToken = new String(token);
+ String expected[] = {
+ expectedToken, expectedToken, expectedToken,
+ expectedToken, expectedToken, expectedToken,
+ expectedToken, expectedToken, expectedToken,
+ expectedToken
+ };
+ assertTokenStreamContents(tokenizer, expected);
+ }
+
+ private Analyzer a = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer tokenizer = new ICUTokenizer(reader);
+ TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+ };
+
+ public void testArmenian() throws Exception {
+ assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
+ new String[] { "վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
+ "ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "վիքիպեդիայի", "կայքը" } );
+ }
+
+ public void testAmharic() throws Exception {
+ assertAnalyzesTo(a, "ዊኪፔድያ የባለ ብዙ ቋንቋ የተሟላ ትክክለኛና ነጻ መዝገበ ዕውቀት (ኢንሳይክሎፒዲያ) ነው። ማንኛውም",
+ new String[] { "ዊኪፔድያ", "የባለ", "ብዙ", "ቋንቋ", "የተሟላ", "ትክክለኛና", "ነጻ", "መዝገበ", "ዕውቀት", "ኢንሳይክሎፒዲያ", "ነው", "ማንኛውም" } );
+ }
+
+ public void testArabic() throws Exception {
+ assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.",
+ new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
+ "بالإنجليزية", "truth", "in", "numbers", "the", "wikipedia", "story", "سيتم", "إطلاقه", "في", "2008" } );
+ }
+
+ public void testAramaic() throws Exception {
+ assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀",
+ new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
+ "ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"});
+ }
+
+ public void testBengali() throws Exception {
+ assertAnalyzesTo(a, "এই বিশ্বকোষ পরিচালনা করে উইকিমিডিয়া ফাউন্ডেশন (একটি অলাভজনক সংস্থা)। উইকিপিডিয়ার শুরু ১৫ জানুয়ারি, ২০০১ সালে। এখন পর্যন্ত ২০০টিরও বেশী ভাষায় উইকিপিডিয়া রয়েছে।",
+ new String[] { "এই", "বিশ্বকোষ", "পরিচালনা", "করে", "উইকিমিডিয়া", "ফাউন্ডেশন", "একটি", "অলাভজনক", "সংস্থা", "উইকিপিডিয়ার",
+ "শুরু", "১৫", "জানুয়ারি", "২০০১", "সালে", "এখন", "পর্যন্ত", "২০০টিরও", "বেশী", "ভাষায়", "উইকিপিডিয়া", "রয়েছে" });
+ }
+
+ public void testFarsi() throws Exception {
+ assertAnalyzesTo(a, "ویکی پدیای انگلیسی در تاریخ ۲۵ دی ۱۳۷۹ به صورت مکملی برای دانشنامهٔ تخصصی نوپدیا نوشته شد.",
+ new String[] { "ویکی", "پدیای", "انگلیسی", "در", "تاریخ", "۲۵", "دی", "۱۳۷۹", "به", "صورت", "مکملی",
+ "برای", "دانشنامهٔ", "تخصصی", "نوپدیا", "نوشته", "شد" });
+ }
+
+ public void testGreek() throws Exception {
+ assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.",
+ new String[] { "γράφεται", "σε", "συνεργασία", "από", "εθελοντέσ", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
+ "σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
+ }
+
+ public void testLao() throws Exception {
+ assertAnalyzesTo(a, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
+ }
+
+ public void testThai() throws Exception {
+ assertAnalyzesTo(a, "การที่ได้ต้องแสดงว่างานดี. แล้วเธอจะไปไหน? ๑๒๓๔",
+ new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "แล้ว", "เธอ", "จะ", "ไป", "ไหน", "๑๒๓๔"});
+ }
+
+ public void testTibetan() throws Exception {
+ assertAnalyzesTo(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །",
+ new String[] { "སྣོན", "མཛོད", "དང", "ལས", "འདིས", "བོད", "ཡིག", "མི", "ཉམས", "གོང", "འཕེལ", "དུ", "གཏོང", "བར", "ཧ", "ཅང", "དགེ", "མཚན", "མཆིས", "སོ" });
+ }
+
+ /*
+ * For Chinese, tokenize as char (these can later form bigrams or whatever)
+ * TODO: why do full-width numerics have no word-break prop?
+ */
+ public void testChinese() throws Exception {
+ assertAnalyzesTo(a, "我是中国人。 １２３４ Tests ",
+ new String[] { "我", "是", "中", "国", "人", "tests"});
+ }
+
+ public void testEmpty() throws Exception {
+ assertAnalyzesTo(a, "", new String[] {});
+ assertAnalyzesTo(a, ".", new String[] {});
+ assertAnalyzesTo(a, " ", new String[] {});
+ }
+
+ /* test various jira issues this analyzer is related to */
+
+ public void testLUCENE1545() throws Exception {
+ /*
+ * Standard analyzer does not correctly tokenize combining character U+0364 COMBINING LATIN SMALL LETTER E.
+ * The word "moͤchte" is incorrectly tokenized into "mo" "chte", the combining character is lost.
+ * Expected result is only one token "moͤchte".
+ */
+ assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" });
+ }
+
+ /* Tests from StandardAnalyzer, just to show behavior is similar */
+ public void testAlphanumericSA() throws Exception {
+ // alphanumeric tokens
+ assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
+ assertAnalyzesTo(a, "2B", new String[]{"2b"});
+ }
+
+ public void testDelimitersSA() throws Exception {
+ // other delimiters: "-", "/", ","
+ assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
+ assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
+ assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
+ }
+
+ public void testApostrophesSA() throws Exception {
+ // internal apostrophes: O'Reilly, you're, O'Reilly's
+ assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
+ assertAnalyzesTo(a, "you're", new String[]{"you're"});
+ assertAnalyzesTo(a, "she's", new String[]{"she's"});
+ assertAnalyzesTo(a, "Jim's", new String[]{"jim's"});
+ assertAnalyzesTo(a, "don't", new String[]{"don't"});
+ assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly's"});
+ }
+
+ public void testNumericSA() throws Exception {
+ // floating point, serial, model numbers, ip addresses, etc.
+ // every other segment must have at least one digit
+ assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
+ assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
+ assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
+ assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
+ }
+
+ public void testTextWithNumbersSA() throws Exception {
+ // numbers
+ assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
+ }
+
+ public void testVariousTextSA() throws Exception {
+ // various
+ assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"});
+ assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
+ assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
+ assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
+ }
+
+ public void testKoreanSA() throws Exception {
+ // Korean words
+ assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
+ }
+
+ public void testReusableTokenStream() throws Exception {
+ assertAnalyzesToReuse(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །",
+ new String[] { "སྣོན", "མཛོད", "དང", "ལས", "འདིས", "བོད", "ཡིག", "མི", "ཉམས", "གོང",
+ "འཕེལ", "དུ", "གཏོང", "བར", "ཧ", "ཅང", "དགེ", "མཚན", "མཆིས", "སོ" });
+ }
+}
Property changes on: contrib\icu\src\test\org\apache\lucene\analysis\icu\segmentation\TestICUTokenizer.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestLaoBreakIterator.java
===================================================================
--- contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestLaoBreakIterator.java (revision 0)
+++ contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestLaoBreakIterator.java (revision 0)
@@ -0,0 +1,90 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.InputStream;
+
+import org.apache.lucene.util.LuceneTestCase;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
+import com.ibm.icu.text.UTF16;
+
+/**
+ * Tests LaoBreakIterator and its RBBI rules
+ */
+public class TestLaoBreakIterator extends LuceneTestCase {
+ private BreakIterator wordIterator;
+
+ @Override
+ protected void setUp() throws Exception {
+ super.setUp();
+ InputStream is = getClass().getResourceAsStream("Lao.brk");
+ wordIterator = new LaoBreakIterator(RuleBasedBreakIterator.getInstanceFromCompiledRules(is));
+ is.close();
+ }
+
+ private void assertBreaksTo(BreakIterator iterator, String sourceText, String tokens[]) {
+ char text[] = sourceText.toCharArray();
+ CharArrayIterator ci = new CharArrayIterator();
+ ci.setText(text, 0, text.length);
+ iterator.setText(ci);
+
+ for (int i = 0; i < tokens.length; i++) {
+ int start, end;
+ do { // advance one segment at a time until isWord accepts it or the text runs out
+ start = iterator.current();
+ end = iterator.next();
+ } while (end != BreakIterator.DONE && !isWord(text, start, end));
+ assertTrue(start != BreakIterator.DONE);
+ assertTrue(end != BreakIterator.DONE); // DONE here means fewer word segments than expected tokens
+ assertEquals(tokens[i], new String(text, start, end - start));
+ }
+
+ assertTrue(iterator.next() == BreakIterator.DONE); // no boundary may remain after the last expected token
+ }
+
+ /** Returns true if any code point in text[start, end) is a letter or digit. */
+ protected boolean isWord(char text[], int start, int end) {
+ int codepoint;
+ for (int i = start; i < end; i += UTF16.getCharCount(codepoint)) {
+ // read at the loop cursor i; reading at start would test only the first code point
+ codepoint = UTF16.charAt(text, 0, end, i);
+ if (UCharacter.isLetterOrDigit(codepoint))
+ return true;
+ }
+ return false;
+ }
+
+ public void testBasicUsage() throws Exception {
+ assertBreaksTo(wordIterator, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
+ assertBreaksTo(wordIterator, "ຜູ້ເຂົ້າ", new String[] { "ຜູ້", "ເຂົ້າ" });
+ assertBreaksTo(wordIterator, "", new String[] {});
+ assertBreaksTo(wordIterator, "ສະບາຍດີ", new String[] { "ສະ", "ບາຍ", "ດີ" });
+ }
+
+ public void testNumerics() throws Exception {
+ assertBreaksTo(wordIterator, "໐໑໒໓", new String[] { "໐໑໒໓" });
+ assertBreaksTo(wordIterator, "໐໑໒໓.໕໖", new String[] { "໐໑໒໓.໕໖" });
+ }
+
+ public void testTextAndNumerics() throws Exception {
+ assertBreaksTo(wordIterator, "ກວ່າດອກ໐໑໒໓", new String[] { "ກວ່າ", "ດອກ", "໐໑໒໓" });
+ }
+}
Property changes on: contrib\icu\src\test\org\apache\lucene\analysis\icu\segmentation\TestLaoBreakIterator.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/RBBIRuleCompiler.java
===================================================================
--- contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/RBBIRuleCompiler.java (revision 0)
+++ contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/RBBIRuleCompiler.java (revision 0)
@@ -0,0 +1,101 @@
+package org.apache.lucene.analysis.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.lang.reflect.Method;
+
+import com.ibm.icu.text.RuleBasedBreakIterator;
+
+/**
+ * Command-line utility that converts RuleBasedBreakIterator (.rbbi) files into
+ * binary compiled form (.brk).
+ */
+public class RBBIRuleCompiler {
+
+ /** Reads ruleFile as UTF-8; a line starting with "#" contributes only its newline. */
+ static String getRules(File ruleFile) throws IOException {
+ StringBuilder rules = new StringBuilder();
+ InputStream in = new FileInputStream(ruleFile);
+ try {
+ BufferedReader cin = new BufferedReader(new InputStreamReader(in, "UTF-8"));
+ for (String line = cin.readLine(); line != null; line = cin.readLine()) {
+ rules.append(line.startsWith("#") ? "" : line).append('\n');
+ }
+ } finally {
+ in.close(); // also runs when a read fails, so the file handle is not leaked
+ }
+ return rules.toString();
+ }
+
+ /** Compiles each file in srcDir whose name ends in "rbbi" to a same-named .brk file in destDir. */
+ static void compile(File srcDir, File destDir) throws Exception {
+ File files[] = srcDir.listFiles(new FilenameFilter() {
+ public boolean accept(File dir, String name) {
+ return name.endsWith("rbbi");
+ }});
+ if (files == null) throw new IOException("Path does not exist: " + srcDir);
+ for (File file : files) {
+ File outputFile = new File(destDir,
+ file.getName().replaceAll("rbbi$", "brk"));
+ String rules = getRules(file);
+ System.err.print("Compiling " + file.getName() + " to "
+ + outputFile.getName() + ": ");
+ /*
+ * if there is a syntax error, compileRules() may succeed. the way to
+ * check is to try to instantiate from the string. additionally if the
+ * rules are invalid, you can get a useful syntax error.
+ */
+ try {
+ new RuleBasedBreakIterator(rules);
+ } catch (IllegalArgumentException e) {
+ // intentional: print only the syntax error, not a massive stack trace
+ System.err.println(e.getMessage());
+ System.exit(1);
+ }
+ FileOutputStream os = new FileOutputStream(outputFile);
+ try {
+ // reflective equivalent of RBBIRuleBuilder.compileRules(rules, os)
+ Class<?> builderClass = Class.forName("com.ibm.icu.text.RBBIRuleBuilder");
+ Method method = builderClass.getDeclaredMethod("compileRules", String.class, OutputStream.class);
+ method.setAccessible(true);
+ method.invoke(null, rules, os);
+ } finally {
+ os.close(); // release the file handle even when the reflective call fails
+ }
+ System.err.println(outputFile.length() + " bytes.");
+ }
+ }
+
+ public static void main(String args[]) throws Exception {
+ if (args.length < 2) {
+ System.err.println("Usage: RBBIRuleCompiler <sourcedir> <destdir>");
+ System.exit(1);
+ }
+ compile(new File(args[0]), new File(args[1])); // args[0]: directory of .rbbi sources, args[1]: output directory
+ System.exit(0);
+ }
+}
Property changes on: contrib\icu\src\tools\java\org\apache\lucene\analysis\icu\RBBIRuleCompiler.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native