From 88e15af2e6996cc9e4617f3f78303dea25e5859d Mon Sep 17 00:00:00 2001
From: Michael Dodsworth
Date: Sat, 28 Jun 2014 20:55:31 -0700
Subject: [PATCH] LUCENE-4730: adding test to verify correct start/end offsets for SmartChineseAnalyzer

diff --git lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
index 1e1c17a..67b0580 100644
--- lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
+++ lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
@@ -109,7 +109,7 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
    * English words are lowercased and porter-stemmed.
    */
   public void testMixedLatinChinese() throws Exception {
-    assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "我购买 Tests 了道具和服装",
+    assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true), "我购买 Tests 了道具和服装",
         new String[] { "我", "购买", "test", "了", "道具", "和", "服装"});
   }
 
@@ -164,6 +164,10 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
         new String[] { "我", "购买", "了", "道具", "和", "服装" },
         new int[] { 0, 1, 3, 4, 6, 7 },
         new int[] { 1, 3, 4, 6, 7, 9 });
+
+    // LUCENE-4730 -- fixed in LUCENE-4984
+    assertAnalyzesTo(new SmartChineseAnalyzer(Version.LUCENE_4_8, true), "My China ",
+        new String[] { "my", "china"}, new int[] {0,3}, new int[] {2, 8});
   }
 
   public void testReusableTokenStream() throws Exception {
@@ -231,7 +235,7 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
         new int[] { 0 },
         new int[] { 11 });
   }
-
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), new SmartChineseAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
-- 
2.0.0