Index: lucene/common-build.xml =================================================================== --- lucene/common-build.xml (revision 991408) +++ lucene/common-build.xml (working copy) @@ -640,8 +640,8 @@ source="@{javac.source}" target="@{javac.target}"> - - + Index: lucene/src/test/org/apache/lucene/queryParser/TestQueryParser.java =================================================================== --- lucene/src/test/org/apache/lucene/queryParser/TestQueryParser.java (revision 991408) +++ lucene/src/test/org/apache/lucene/queryParser/TestQueryParser.java (working copy) @@ -247,8 +247,8 @@ public void testCJK() throws Exception { // Test Ideographic Space - As wide as a CJK character cell (fullwidth) // used google to translate the word "term" to japanese -> 用語 - assertQueryEquals("term\u3000term\u3000term", null, "term\u0020term\u0020term"); - assertQueryEquals("用語\u3000用語\u3000用語", null, "用語\u0020用語\u0020用語"); + assertQueryEquals("term\u3000term\u3000term", new MockAnalyzer(), "term\u0020term\u0020term"); + assertQueryEquals("用語\u3000用語\u3000用語", new MockAnalyzer(), "用語\u0020用語\u0020用語"); } //individual CJK chars as terms, like StandardAnalyzer Index: lucene/src/test/org/apache/lucene/analysis/MockTokenizer.java =================================================================== --- lucene/src/test/org/apache/lucene/analysis/MockTokenizer.java (revision 991408) +++ lucene/src/test/org/apache/lucene/analysis/MockTokenizer.java (working copy) @@ -37,10 +37,10 @@ */ public static final CharacterRunAutomaton KEYWORD = new CharacterRunAutomaton(new RegExp(".*").toAutomaton()); - /** Acts like LetterTokenizer. */ - // the ugly regex below is Unicode 5.2 [:Letter:] + /** Acts like LetterTokenizer, but only on latin1. + * a full isCharacter(Letter) regexp is huge and slow to load */ public static final CharacterRunAutomaton SIMPLE = - new CharacterRunAutomaton(new RegExp("[A-Za-zªµºÀ-ÖØ-öø-ˁˆ-ˑˠ-ˤˬˮͰ-ʹͶͷͺ-ͽΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԥԱ-Ֆՙա-ևא-תװ-ײء-يٮٯٱ-ۓەۥۦۮۯۺ-ۼۿܐܒ-ܯݍ-ޥޱߊ-ߪߴߵߺࠀ-ࠕࠚࠤࠨऄ-हऽॐक़-ॡॱॲॹ-ॿঅ-ঌএঐও-নপ-রলশ-হঽৎড়ঢ়য়-ৡৰৱਅ-ਊਏਐਓ-ਨਪ-ਰਲਲ਼ਵਸ਼ਸਹਖ਼-ੜਫ਼ੲ-ੴઅ-ઍએ-ઑઓ-નપ-રલળવ-હઽૐૠૡଅ-ଌଏଐଓ-ନପ-ରଲଳଵ-ହଽଡ଼ଢ଼ୟ-ୡୱஃஅ-ஊஎ-ஐஒ-கஙசஜஞடணதந-பம-ஹௐఅ-ఌఎ-ఐఒ-నప-ళవ-హఽౘౙౠౡಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹಽೞೠೡഅ-ഌഎ-ഐഒ-നപ-ഹഽൠൡൺ-ൿඅ-ඖක-නඳ-රලව-ෆก-ะาำเ-ๆກຂຄງຈຊຍດ-ທນ-ຟມ-ຣລວສຫອ-ະາຳຽເ-ໄໆໜໝༀཀ-ཇཉ-ཬྈ-ྋက-ဪဿၐ-ၕၚ-ၝၡၥၦၮ-ၰၵ-ႁႎႠ-Ⴥა-ჺჼᄀ-ቈቊ-ቍቐ-ቖቘቚ-ቝበ-ኈኊ-ኍነ-ኰኲ-ኵኸ-ኾዀዂ-ዅወ-ዖዘ-ጐጒ-ጕጘ-ፚᎀ-ᎏᎠ-Ᏼᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᜀ-ᜌᜎ-ᜑᜠ-ᜱᝀ-ᝑᝠ-ᝬᝮ-ᝰក-ឳៗៜᠠ-ᡷᢀ-ᢨᢪᢰ-ᣵᤀ-ᤜᥐ-ᥭᥰ-ᥴᦀ-ᦫᧁ-ᧇᨀ-ᨖᨠ-ᩔᪧᬅ-ᬳᭅ-ᭋᮃ-ᮠᮮᮯᰀ-ᰣᱍ-ᱏᱚ-ᱽᳩ-ᳬᳮ-ᳱᴀ-ᶿḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼⁱⁿₐ-ₔℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-Ⱞⰰ-ⱞⱠ-ⳤⳫ-ⳮⴀ-ⴥⴰ-ⵥⵯⶀ-ⶖⶠ-ⶦⶨ-ⶮⶰ-ⶶⶸ-ⶾⷀ-ⷆⷈ-ⷎⷐ-ⷖⷘ-ⷞⸯ々〆〱-〵〻〼ぁ-ゖゝ-ゟァ-ヺー-ヿㄅ-ㄭㄱ-ㆎㆠ-ㆷㇰ-ㇿ㐀-䶵一-鿋ꀀ-ꒌꓐ-ꓽꔀ-ꘌꘐ-ꘟꘪꘫꙀ-ꙟꙢ-ꙮꙿ-ꚗꚠ-ꛥꜗ-ꜟꜢ-ꞈꞋꞌꟻ-ꠁꠃ-ꠅꠇ-ꠊꠌ-ꠢꡀ-ꡳꢂ-ꢳꣲ-ꣷꣻꤊ-ꤥꤰ-ꥆꥠ-ꥼꦄ-ꦲꧏꨀ-ꨨꩀ-ꩂꩄ-ꩋꩠ-ꩶꩺꪀ-ꪯꪱꪵꪶꪹ-ꪽꫀꫂꫛ-ꫝꯀ-ꯢ가-힣ힰ-ퟆퟋ-ퟻ豈-鶴侮-舘並-龎ff-stﬓ-ﬗיִײַ-ﬨשׁ-זּטּ-לּמּנּסּףּפּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-ﷻﹰ-ﹴﹶ-ﻼA-Za-zヲ-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ𐀀-𐀋𐀍-𐀦𐀨-𐀺𐀼𐀽𐀿-𐁍𐁐-𐁝𐂀-𐃺𐊀-𐊜𐊠-𐋐𐌀-𐌞𐌰-𐍀𐍂-𐍉𐎀-𐎝𐎠-𐏃𐏈-𐏏𐐀-𐒝𐠀-𐠅𐠈𐠊-𐠵𐠷𐠸𐠼𐠿-𐡕𐤀-𐤕𐤠-𐤹𐨀𐨐-𐨓𐨕-𐨗𐨙-𐨳𐩠-𐩼𐬀-𐬵𐭀-𐭕𐭠-𐭲𐰀-𐱈𑂃-𑂯𒀀-𒍮𓀀-𓐮𝐀-𝑔𝑖-𝒜𝒞𝒟𝒢𝒥𝒦𝒩-𝒬𝒮-𝒹𝒻𝒽-𝓃𝓅-𝔅𝔇-𝔊𝔍-𝔔𝔖-𝔜𝔞-𝔹𝔻-𝔾𝕀-𝕄𝕆𝕊-𝕐𝕒-𝚥𝚨-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𝟋𠀀-𪛖𪜀-𫜴丽-𪘀]+").toAutomaton()); + new CharacterRunAutomaton(new RegExp("[a-zA-Z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF]+").toAutomaton()); private final CharacterRunAutomaton runAutomaton; private final boolean lowerCase; Index: lucene/src/test/org/apache/lucene/analysis/TestCharTokenizers.java =================================================================== --- lucene/src/test/org/apache/lucene/analysis/TestCharTokenizers.java (revision 991408) +++ lucene/src/test/org/apache/lucene/analysis/TestCharTokenizers.java (working copy) @@ -47,7 +47,7 @@ } // internal buffer size is 1024 make sure we have a surrogate pair right at the border builder.insert(1023, "\ud801\udc1c"); - MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString()), MockTokenizer.SIMPLE, true); + MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString()), MockTokenizer.WHITESPACE, true); assertTokenStreamContents(tokenizer, builder.toString().toLowerCase().split(" ")); } @@ -64,7 +64,7 @@ builder.append("a"); } builder.append("\ud801\udc1cabc"); - MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString()), MockTokenizer.SIMPLE, true); + MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString()), MockTokenizer.WHITESPACE, true); assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase()}); } } @@ -78,7 +78,7 @@ for (int i = 0; i < 255; i++) { builder.append("A"); } - MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString() + builder.toString()), MockTokenizer.SIMPLE, true); + MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString() + builder.toString()), MockTokenizer.WHITESPACE, true); assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()}); } @@ -92,7 +92,7 @@ builder.append("A"); } builder.append("\ud801\udc1c"); - MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString() + builder.toString()), MockTokenizer.SIMPLE, true); + MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString() + builder.toString()), MockTokenizer.WHITESPACE, true); assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()}); } Index: lucene/src/test/org/apache/lucene/util/TestCharacterUtils.java =================================================================== --- lucene/src/test/org/apache/lucene/util/TestCharacterUtils.java (revision 991408) +++ lucene/src/test/org/apache/lucene/util/TestCharacterUtils.java (working copy) @@ -43,7 +43,7 @@ try { java4.codePointAt(highSurrogateAt3, 4); fail("array index out of bounds"); - } catch (ArrayIndexOutOfBoundsException e) { + } catch (IndexOutOfBoundsException e) { } CharacterUtils java5 = CharacterUtils.getInstance(TEST_VERSION_CURRENT); @@ -54,7 +54,7 @@ try { java5.codePointAt(highSurrogateAt3, 4); fail("array index out of bounds"); - } catch (ArrayIndexOutOfBoundsException e) { + } catch (IndexOutOfBoundsException e) { } } @@ -69,7 +69,7 @@ try { java4.codePointAt(highSurrogateAt3, 4); fail("string index out of bounds"); - } catch (StringIndexOutOfBoundsException e) { + } catch (IndexOutOfBoundsException e) { } CharacterUtils java5 = CharacterUtils.getInstance(TEST_VERSION_CURRENT); @@ -80,7 +80,7 @@ try { java5.codePointAt(highSurrogateAt3, 4); fail("string index out of bounds"); - } catch (StringIndexOutOfBoundsException e) { + } catch (IndexOutOfBoundsException e) { } }