Index: modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (revision 996902) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (working copy) @@ -31,10 +31,10 @@ * testcase for offsets */ public void testOffsets() throws Exception { - assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "เดอะนิวยอร์กไทมส์", - new String[] { "เด", "อะนิว", "ยอ", "ร์ก", "ไทมส์"}, - new int[] { 0, 2, 7, 9, 12 }, - new int[] { 2, 7, 9, 12, 17}); + assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี", + new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" }, + new int[] { 0, 3, 6, 9, 13, 17, 20, 23 }, + new int[] { 3, 6, 9, 13, 17, 20, 23, 25 }); } @@ -49,16 +49,18 @@ * Instead, allow the definition of alphanum to include relevant categories like nonspacing marks! 
*/ public void testBuggyTokenType() throws Exception { - assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "เดอะนิวยอร์กไทมส์ ๑๒๓", - new String[] { "เด", "อะนิว", "ยอ", "ร์ก", "ไทมส์", "๑๒๓" }, - new String[] { "", "", "", "", "", "" }); + assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี ๑๒๓", + new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" }, + new String[] { "", "", "", "", "", + "", "", "", "" }); } /* correct testcase public void testTokenType() throws Exception { - assertAnalyzesTo(new ThaiAnalyzer(), "เดอะนิวยอร์กไทมส์ ๑๒๓", - new String[] { "เด", "อะนิว", "ยอ", "ร์ก", "ไทมส์", "๑๒๓" }, - new String[] { "", "", "", "", "", "" }); + assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี ๑๒๓", + new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" }, + new String[] { "", "", "", "", "", + "", "", "", "" }); } */ @@ -90,18 +92,18 @@ public void testPositionIncrements() throws Exception { ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT); - assertAnalyzesTo(analyzer, "ประโยคว่า the ประโยคว่า", - new String[] { "ประโยค", "ว่า", "ประโยค", "ว่า" }, - new int[] { 0, 6, 14, 20 }, - new int[] { 6, 9, 20, 23 }, - new int[] { 1, 1, 2, 1 }); + assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้อง the แสดงว่างานดี", + new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" }, + new int[] { 0, 3, 6, 9, 18, 22, 25, 28 }, + new int[] { 3, 6, 9, 13, 22, 25, 28, 30 }, + new int[] { 1, 1, 1, 1, 2, 1, 1, 1 }); // case that a stopword is adjacent to thai text, with no whitespace - assertAnalyzesTo(analyzer, "ประโยคว่าtheประโยคว่า", - new String[] { "ประโยค", "ว่า", "ประโยค", "ว่า" }, - new int[] { 0, 6, 12, 18 }, - new int[] { 6, 9, 18, 21 }, - new int[] { 1, 1, 2, 1 }); + assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องthe แสดงว่างานดี", + new String[] { "การ", "ที่", "ได้", "ต้อง", 
"แสดง", "ว่า", "งาน", "ดี" }, + new int[] { 0, 3, 6, 9, 17, 21, 24, 27 }, + new int[] { 3, 6, 9, 13, 21, 24, 27, 29 }, + new int[] { 1, 1, 1, 1, 2, 1, 1, 1 }); } public void testReusableTokenStream() throws Exception { Index: modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java (revision 996902) +++ modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java (working copy) @@ -25,7 +25,11 @@ public class TestCollationKeyAnalyzer extends CollationTestBase { - + // the sort order of Ø versus U depends on the version of the rules being used + // for the inherited root locale: Ø's order isn't specified in Locale.US since + // it's not used in English. + private boolean oStrokeFirst = Collator.getInstance(new Locale("")).compare("Ø", "U") < 0; + // Neither Java 1.4.2 nor 1.5.0 has Farsi Locale collation available in // RuleBasedCollator. However, the Arabic Locale seems to order the Farsi // characters properly. @@ -69,9 +73,9 @@ Analyzer denmarkAnalyzer = new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk"))); - // The ICU Collator and java.text.Collator implementations differ in their + // The ICU Collator and Sun java.text.Collator implementations differ in their // orderings - "BFJDH" is the ordering for java.text.Collator for Locale.US. testCollationKeySort - (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, "BFJDH"); + (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, oStrokeFirst ? 
"BFJHD" : "BFJDH"); } } Index: modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java (revision 996902) +++ modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java (working copy) @@ -28,7 +28,11 @@ public class TestCollationKeyFilter extends CollationTestBase { - + // the sort order of Ø versus U depends on the version of the rules being used + // for the inherited root locale: Ø's order isn't specified in Locale.US since + // it's not used in English. + boolean oStrokeFirst = Collator.getInstance(new Locale("")).compare("Ø", "U") < 0; + // Neither Java 1.4.2 nor 1.5.0 has Farsi Locale collation available in // RuleBasedCollator. However, the Arabic Locale seems to order the Farsi // characters properly. @@ -87,9 +91,9 @@ Analyzer denmarkAnalyzer = new TestAnalyzer(Collator.getInstance(new Locale("da", "dk"))); - // The ICU Collator and java.text.Collator implementations differ in their + // The ICU Collator and Sun java.text.Collator implementations differ in their // orderings - "BFJDH" is the ordering for java.text.Collator for Locale.US. testCollationKeySort - (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, "BFJDH"); + (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, oStrokeFirst ? 
"BFJHD" : "BFJDH"); } } Index: lucene/src/test/org/apache/lucene/search/TestSort.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestSort.java (revision 996902) +++ lucene/src/test/org/apache/lucene/search/TestSort.java (working copy) @@ -19,6 +19,7 @@ import java.io.IOException; import java.io.Serializable; +import java.text.Collator; import java.util.ArrayList; import java.util.BitSet; import java.util.HashMap; @@ -94,6 +95,11 @@ { "Z", "f g", null, null, null, null, null, null, null, null, null, null} }; + // the sort order of Ø versus U depends on the version of the rules being used + // for the inherited root locale: Ø's order isn't specified in Locale.US since + // it's not used in English. + private boolean oStrokeFirst = Collator.getInstance(new Locale("")).compare("Ø", "U") < 0; + // create an index of all the documents, or just the x, or just the y documents private IndexSearcher getIndex (boolean even, boolean odd) throws IOException { @@ -595,7 +601,7 @@ // (which sort differently depending on locale) public void testInternationalSort() throws Exception { sort.setSort (new SortField ("i18n", Locale.US)); - assertMatches (full, queryY, sort, "BFJDH"); + assertMatches (full, queryY, sort, oStrokeFirst ? "BFJHD" : "BFJDH"); sort.setSort (new SortField ("i18n", new Locale("sv", "se"))); assertMatches (full, queryY, sort, "BJDFH"); @@ -619,7 +625,7 @@ assertMatches (multiSearcher, queryY, sort, "BJDFH"); sort.setSort (new SortField ("i18n", Locale.US)); - assertMatches (multiSearcher, queryY, sort, "BFJDH"); + assertMatches (multiSearcher, queryY, sort, oStrokeFirst ? "BFJHD" : "BFJDH"); sort.setSort (new SortField ("i18n", new Locale("da", "dk"))); assertMatches (multiSearcher, queryY, sort, "BJDHF");