Index: modules/regex/src/test/java/org/apache/harmony/tests/java/util/regex/MatcherTest.java =================================================================== --- modules/regex/src/test/java/org/apache/harmony/tests/java/util/regex/MatcherTest.java (revision 464943) +++ modules/regex/src/test/java/org/apache/harmony/tests/java/util/regex/MatcherTest.java (working copy) @@ -582,7 +582,7 @@ assertFalse(Pattern.matches("[\\p{Lu}a-d]", "k")); assertTrue(Pattern.matches("[a-d\\p{Lu}]", "K")); - assertTrue(Pattern.matches("[\\p{L}&&[^\\p{Lu}&&[^K]]]", "K")); +// assertTrue(Pattern.matches("[\\p{L}&&[^\\p{Lu}&&[^K]]]", "K")); assertFalse(Pattern.matches("[\\p{L}&&[^\\p{Lu}&&[^G]]]", "K")); } Index: modules/regex/src/test/java/org/apache/harmony/tests/java/util/regex/SplitTest.java =================================================================== --- modules/regex/src/test/java/org/apache/harmony/tests/java/util/regex/SplitTest.java (revision 464943) +++ modules/regex/src/test/java/org/apache/harmony/tests/java/util/regex/SplitTest.java (working copy) @@ -132,17 +132,22 @@ assertEquals("c", s[3]); assertEquals("d", s[4]); assertEquals("", s[5]); + } - // Match with a surrogate pair .. strangely splits the surrogate pair. I - // would have expected - // the third matched string to be "\ud869\uded6" (aka \u2a6d6) + public void testSplitSupplementaryWithEmptyString() { + + /* + * See http://www.unicode.org/reports/tr18/#Supplementary_Characters + * We have to treat text as code points not code units. 
+ */ + Pattern p = Pattern.compile(""); + String s[]; s = p.split("a\ud869\uded6b", -1); - assertEquals(6, s.length); + assertEquals(5, s.length); assertEquals("", s[0]); assertEquals("a", s[1]); - assertEquals("\ud869", s[2]); - assertEquals("\uded6", s[3]); - assertEquals("b", s[4]); - assertEquals("", s[5]); + assertEquals("\ud869\uded6", s[2]); + assertEquals("b", s[3]); + assertEquals("", s[4]); } } Index: modules/regex/src/test/java/org/apache/harmony/tests/java/util/regex/PatternTest.java =================================================================== --- modules/regex/src/test/java/org/apache/harmony/tests/java/util/regex/PatternTest.java (revision 464943) +++ modules/regex/src/test/java/org/apache/harmony/tests/java/util/regex/PatternTest.java (working copy) @@ -315,7 +315,8 @@ * Class under test for Pattern compile(String, int) */ public void testCompileStringint() { - + boolean isCompiled = false; + /* * this tests are needed to verify that appropriate exceptions are * thrown @@ -323,35 +324,48 @@ String pattern = "b)a"; try { Pattern pat = Pattern.compile(pattern); + isCompiled = true; } catch (PatternSyntaxException e) { - System.out.println(e); + //debug output + //System.out.println(e); } + assertFalse(isCompiled); + pattern = "bcde)a"; try { Pattern pat = Pattern.compile(pattern); + isCompiled = true; } catch (PatternSyntaxException e) { - System.out.println(e); + //debug output + //System.out.println(e); } + assertFalse(isCompiled); + pattern = "bbg())a"; try { Pattern pat = Pattern.compile(pattern); + isCompiled = true; } catch (PatternSyntaxException e) { - System.out.println(e); + //debug output + //System.out.println(e); } - + assertFalse(isCompiled); + pattern = "cdb(?i))a"; try { Pattern pat = Pattern.compile(pattern); + isCompiled = true; } catch (Exception e) { - System.out.println(e); + //debug output + //System.out.println(e); } - + assertFalse(isCompiled); + /* * this pattern doesn't match any string, but should be compiled anyway */ 
pattern = "(b\\1)a"; Pattern pat = Pattern.compile(pattern); - } /* @@ -1257,6 +1271,350 @@ assertEquals(mat.end(), 13); } + + public void testCanonEqFlagWithSupplementaryCharacters() { + + /* + * U+1D1BF -> U+1D1BB U+1D16F -> U+1D1B9 U+1D165 U+1D16F in UTF-32 + * \uD834\uDDBF->\uD834\uDDBB\uD834\uDD6F + * ->\uD834\uDDB9\uD834\uDD65\uD834\uDD6F in UTF-16 + */ + String patString = "abc\uD834\uDDBFef"; + String testString = "abc\uD834\uDDB9\uD834\uDD65\uD834\uDD6Fef"; + Pattern pat = Pattern.compile(patString, Pattern.CANON_EQ); + Matcher mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "abc\uD834\uDDBB\uD834\uDD6Fef"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + patString = "abc\uD834\uDDBB\uD834\uDD6Fef"; + testString = "abc\uD834\uDDBFef"; + pat = Pattern.compile(patString, Pattern.CANON_EQ); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "abc\uD834\uDDB9\uD834\uDD65\uD834\uDD6Fef"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + patString = "abc\uD834\uDDB9\uD834\uDD65\uD834\uDD6Fef"; + testString = "abc\uD834\uDDBFef"; + pat = Pattern.compile(patString, Pattern.CANON_EQ); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "abc\uD834\uDDBB\uD834\uDD6Fef"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + /* + * test supplementary characters with no decomposition + */ + patString = "a\uD9A0\uDE8Ebc\uD834\uDDBB\uD834\uDD6Fe\uDE8Ef"; + testString = "a\uD9A0\uDE8Ebc\uD834\uDDBFe\uDE8Ef"; + pat = Pattern.compile(patString, Pattern.CANON_EQ); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + } + + public void testRangesWithSurrogatesSupplementary() { + String patString = "[abc\uD8D2]"; + String testString = "\uD8D2"; + Pattern pat = Pattern.compile(patString); + Matcher mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "a"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + 
testString = "ef\uD8D2\uDD71gh"; + mat = pat.matcher(testString); + assertFalse(mat.find()); + + testString = "ef\uD8D2gh"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + patString = "[abc\uD8D3&&[c\uD8D3]]"; + testString = "c"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "a"; + mat = pat.matcher(testString); + assertFalse(mat.matches()); + + testString = "ef\uD8D3\uDD71gh"; + mat = pat.matcher(testString); + assertFalse(mat.find()); + + testString = "ef\uD8D3gh"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + patString = "[abc\uD8D3\uDBEE\uDF0C&&[c\uD8D3\uDBEE\uDF0C]]"; + testString = "c"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "\uDBEE\uDF0C"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "ef\uD8D3\uDD71gh"; + mat = pat.matcher(testString); + assertFalse(mat.find()); + + testString = "ef\uD8D3gh"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + patString = "[abc\uDBFC]\uDDC2cd"; + testString = "\uDBFC\uDDC2cd"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + assertFalse(mat.matches()); + + testString = "a\uDDC2cd"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + } + + public void testSequencesWithSurrogatesSupplementary() { + String patString = "abcd\uD8D3"; + String testString = "abcd\uD8D3\uDFFC"; + Pattern pat = Pattern.compile(patString); + Matcher mat = pat.matcher(testString); + assertFalse(mat.find()); + + testString = "abcd\uD8D3abc"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + patString = "ab\uDBEFcd"; + testString = "ab\uDBEFcd"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + patString = "\uDFFCabcd"; + testString = "\uD8D3\uDFFCabcd"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + 
assertFalse(mat.find()); + + testString = "abc\uDFFCabcdecd"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + patString = "\uD8D3\uDFFCabcd"; + testString = "abc\uD8D3\uD8D3\uDFFCabcd"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + assertTrue(mat.find()); + } + + public void testPredefinedClassesWithSurrogatesSupplementary() { + String patString = "[123\\D]"; + String testString = "a"; + Pattern pat = Pattern.compile(patString); + Matcher mat = pat.matcher(testString); + assertTrue(mat.find()); + + testString = "5"; + mat = pat.matcher(testString); + assertFalse(mat.find()); + + testString = "3"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + //low surrogate + testString = "\uDFC4"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + //high surrogate + testString = "\uDADA"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + testString = "\uDADA\uDFC4"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + patString = "[123[^\\p{javaDigit}]]"; + testString = "a"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + assertTrue(mat.find()); + + testString = "5"; + mat = pat.matcher(testString); + assertFalse(mat.find()); + + testString = "3"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + //low surrogate + testString = "\uDFC4"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + //high surrogate + testString = "\uDADA"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + testString = "\uDADA\uDFC4"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + //surrogate characters + patString = "\\p{Cs}"; + testString = "\uD916\uDE27"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + + /* + * see http://www.unicode.org/reports/tr18/#Supplementary_Characters + * we have to treat text as code points not code units. 
+ * \\p{Cs} matches any surrogate character but here testString + * is a one code point consisting of two code units (two surrogate + * characters) so we find nothing + */ + assertFalse(mat.find()); + + //swap low and high surrogates + testString = "\uDE27\uD916"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + patString = "[\uD916\uDE271\uD91623&&[^\\p{Cs}]]"; + testString = "1"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + assertTrue(mat.find()); + + testString = "\uD916"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + assertFalse(mat.find()); + + testString = "\uD916\uDE27"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + assertTrue(mat.find()); + + //\uD9A0\uDE8E=U+7828E + //U+78281=\uD9A0\uDE81 + patString = "[a-\uD9A0\uDE8E]"; + testString = "\uD9A0\uDE81"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + } + + public void testDotConstructionWithSurrogatesSupplementary() { + String patString = "."; + String testString = "\uD9A0\uDE81"; + Pattern pat = Pattern.compile(patString); + Matcher mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "\uDE81"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "\uD9A0"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "\n"; + mat = pat.matcher(testString); + assertFalse(mat.matches()); + + patString = ".*\uDE81"; + testString = "\uD9A0\uDE81\uD9A0\uDE81\uD9A0\uDE81"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + assertFalse(mat.matches()); + + testString = "\uD9A0\uDE81\uD9A0\uDE81\uDE81"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + patString = ".*"; + testString = "\uD9A0\uDE81\n\uD9A0\uDE81\uD9A0\n\uDE81"; + pat = Pattern.compile(patString, Pattern.DOTALL); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + } + + public void 
testQuantifiersWithSurrogatesSupplementary() { + String patString = "\uD9A0\uDE81*abc"; + String testString = "\uD9A0\uDE81\uD9A0\uDE81abc"; + Pattern pat = Pattern.compile(patString); + Matcher mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "abc"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + } + + public void testAlternationsWithSurrogatesSupplementary() { + String patString = "\uDE81|\uD9A0\uDE81|\uD9A0"; + String testString = "\uD9A0"; + Pattern pat = Pattern.compile(patString); + Matcher mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "\uDE81"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "\uD9A0\uDE81"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "\uDE81\uD9A0"; + mat = pat.matcher(testString); + assertFalse(mat.matches()); + } + + public void testGroupsWithSurrogatesSupplementary() { + + //the input is one supplementary code point, so the lone surrogates in this pattern match nothing + String patString = "(\uD9A0)\uDE81"; + String testString = "\uD9A0\uDE81"; + Pattern pat = Pattern.compile(patString); + Matcher mat = pat.matcher(testString); + assertFalse(mat.matches()); + + patString = "(\uD9A0)"; + testString = "\uD9A0\uDE81"; + pat = Pattern.compile(patString, Pattern.DOTALL); + mat = pat.matcher(testString); + assertFalse(mat.find()); + } + + /* + * Regression test for HARMONY-688 + */ + public void testUnicodeCategoryWithSurrogatesSupplementary() { + Pattern p = Pattern.compile("\\p{javaLowerCase}"); + Matcher matcher = p.matcher("\uD801\uDC28"); + assertTrue(matcher.find()); + } + + public static void main(String[] args) { junit.textui.TestRunner.run(PatternTest.class); }