Index: modules/regex/src/main/java/java/util/regex/ReluctantQuantifierSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/ReluctantQuantifierSet.java (revision 450389) +++ modules/regex/src/main/java/java/util/regex/ReluctantQuantifierSet.java (working copy) @@ -35,7 +35,6 @@ public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { - int i = 0; int shift = 0; do { Index: modules/regex/src/main/java/java/util/regex/CompositeQuantifierSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/CompositeQuantifierSet.java (revision 450389) +++ modules/regex/src/main/java/java/util/regex/CompositeQuantifierSet.java (working copy) @@ -70,7 +70,7 @@ if (shift >= 0) { return shift; } - stringIndex--; + stringIndex -= leaf.charCount(); } return -1; Index: modules/regex/src/main/java/java/util/regex/EmptySet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/EmptySet.java (revision 450389) +++ modules/regex/src/main/java/java/util/regex/EmptySet.java (working copy) @@ -40,6 +40,70 @@ return 0; } + public int find(int stringIndex, CharSequence testString, + MatchResultImpl matchResult) { + int strLength = matchResult.getRightBound(); + int startStr = matchResult.getLeftBound(); + + while (stringIndex <= strLength) { + + //check for supplementary codepoints + if (stringIndex < strLength) { + char low = testString.charAt(stringIndex); + + if (Character.isLowSurrogate(low)) { + + if (stringIndex > startStr) { + char high = testString.charAt(stringIndex - 1); + if (Character.isHighSurrogate(high)) { + stringIndex++; + continue; + } + } + } + } + + if (next.matches(stringIndex, testString, matchResult) >= 0) { + return stringIndex; + } + stringIndex++; + } + + return -1; + } + + public int findBack(int stringIndex, int startSearch, + 
CharSequence testString, MatchResultImpl matchResult) { + int strLength = matchResult.getRightBound(); + int startStr = matchResult.getLeftBound(); + + while (startSearch >= stringIndex) { + + //check for supplementary codepoints + if (startSearch < strLength) { + char low = testString.charAt(startSearch); + + if (Character.isLowSurrogate(low)) { + + if (startSearch > startStr) { + char high = testString.charAt(startSearch - 1); + if (Character.isHighSurrogate(high)) { + startSearch--; + continue; + } + } + } + } + + if (next.matches(startSearch, testString, matchResult) >= 0) { + return startSearch; + } + startSearch--; + } + + return -1; + } + /* * @see java.util.regex.AbstractSet#getName() */ Index: modules/regex/src/main/java/java/util/regex/Lexer.java =================================================================== --- modules/regex/src/main/java/java/util/regex/Lexer.java (revision 450389) +++ modules/regex/src/main/java/java/util/regex/Lexer.java (working copy) @@ -123,9 +123,6 @@ */ static final int MAX_HANGUL_DECOMPOSITION_LENGTH = 3; - //maximum value of codepoint for basic multilingual pane of Unicode - static final int MAX_CODEPOINT_BASIC_MULTILINGUAL_PANE = 0xFFFF; - /* * Following constants are needed for Hangul canonical decomposition. 
* Hangul decomposition algorithm and constants are taken according @@ -363,8 +360,8 @@ singleDecompTable = SingleDecompositions.getHashSingleDecompositions(); singleDecompTableSize = singleDecompTable.size; - for (int i = 0; i < inputLength; i += Lexer.charCount(ch)) { - ch = Lexer.codePointAt(inputChars, i); + for (int i = 0; i < inputLength; i += Character.charCount(ch)) { + ch = Character.codePointAt(inputChars, i); inputCodePoints[inputCodePointsIndex++] = ch; } @@ -425,7 +422,7 @@ * Translating into UTF-16 encoding */ for (int i = 0; i < decompHangulIndex; i++) { - result.append(Lexer.toChars(decompHangul[i])); + result.append(Character.toChars(decompHangul[i])); } return result.toString(); @@ -443,7 +440,7 @@ static int [] getCanonicalOrder(int [] inputInts, int length) { int inputLength = (length < inputInts.length) ? length - : inputInts.length; + : inputInts.length; /* * Simple bubble-sort algorithm. @@ -509,19 +506,23 @@ reread = false; // read next character analize it and construct token: // // - lookAhead = (index < pattern.length) ? pattern[nextIndex()] : 0; + + lookAhead = (index < pattern.length) ? nextCodePoint() : 0; lookAheadST = null; if (mode == Lexer.MODE_ESCAPE) { if (lookAhead == '\\') { + + //need not care about supplementary codepoints here lookAhead = (index < pattern.length) ? pattern[nextIndex()] : 0; switch (lookAhead) { case 'E': { mode = saved_mode; + lookAhead = (index <= pattern.length - 2) - ? pattern[nextIndex()] + ? nextCodePoint() : 0; break; } @@ -538,7 +539,8 @@ } if (lookAhead == '\\') { - lookAhead = (index < pattern.length - 2) ? pattern[nextIndex()] + + lookAhead = (index < pattern.length - 2) ? nextCodePoint() : -1; switch (lookAhead) { case -1: @@ -647,6 +649,8 @@ break; case 'c': { if (index < pattern.length - 2) { + + //need not care about supplementary codepoints here lookAhead = (pattern[nextIndex()] & 0x1f); break; } else { @@ -961,6 +965,8 @@ * Returns true if current character is plain token. 
*/ public static boolean isLetter(int ch) { + + //all supplementary codepoints have integer value that is >= 0; return ch >= 0; } @@ -974,6 +980,28 @@ return !isEmpty() && !isSpecial() && isLetter(ch); } + /* + * Note that Character class methods + * isHighSurrogate(), isLowSurrogate() + * take char parameter while we need an int + * parameter without truncation to char value + */ + public boolean isHighSurrogate() { + return (ch <= 0xDBFF) && (ch >= 0xD800); + } + + public boolean isLowSurrogate() { + return (ch <= 0xDFFF) && (ch >= 0xDC00); + } + + public static boolean isHighSurrogate(int ch) { + return (ch <= 0xDBFF) && (ch >= 0xD800); + } + + public static boolean isLowSurrogate(int ch) { + return (ch <= 0xDFFF) && (ch >= 0xDC00); + } + /** * Process hexadecimal integer. */ @@ -1029,7 +1057,7 @@ } /** - * Process expression flags givent with (?idmsux-idmsux) + * Process expression flags given with (?idmsux-idmsux) */ private int readFlags() { char ch; @@ -1162,7 +1190,7 @@ * "3.12 Conjoining Jamo Behavior". * * @param ch - given Hangul syllable - * @return canonical decoposition of ch. + * @return canonical decomposition of ch. */ static int [] getHangulDecomposition(int ch) { int SIndex = ch - SBase; @@ -1200,59 +1228,6 @@ ? 0 : canClass; } - - /** - * Simple stub to Character.charCount(). - * - * @param - ch Unicode codepoint - * @return number of chars that are occupied by Unicode - * codepoint ch in UTF-16 encoding. - */ - final static int charCount(int ch) { - - //return Character.charCount(ch); - return 1; - } - - /** - * Simple stub to Character.codePointAt(). - * - * @param - source - * @param - index - * @return Unicode codepoint at given index at source. - * Note that codepoint can reside in two adjacent chars. - */ - final static int codePointAt(char [] source, int index) { - - //return Character.codePointAt(source, index); - return source[index]; - } - - /** - * Simple stub to Character.toChars(). 
- * - * @param - ch Unicode codepoint - * @return UTF-16 encoding of given code point. - */ - final static char [] toChars(int ch) { - - //return Character.toChars(ch); - return new char [] {(char) ch}; - } - - /** - * Simple stub to Character.isSurrogatePair(). - * - * @param high high-surrogate char - * @param low low-surrogate char - * @return true if high and low compose an UTF-16 encoding - * of some Unicode codepoint (we call such codepoint "surrogate") - */ - final static boolean isSurrogatePair(char high, char low) { - - //return Character.isSurrogatePair(char, low) - return false; - } /** * Tests if given codepoint is a canonical decomposition of another @@ -1283,38 +1258,25 @@ static boolean hasDecompositionNonNullCanClass(int ch) { return ch == 0x0340 | ch == 0x0341 | ch == 0x0343 | ch == 0x0344; } - - /** - * Reads next Unicode codepoint. - * - * @return current Unicode codepoint and moves string - * index to the next one. - */ - int nextChar() { - int ch = 0; - if (!this.isEmpty()) { - char nextChar = (char) lookAhead; - char curChar = (char) ch; - - if (Lexer.isSurrogatePair(curChar, nextChar)){ - - /* - * Note that it's slow to create new arrays each time - * when calling to nextChar(). This should be optimized - * later when we will actively use surrogate codepoints. - * You can consider this as simple stub. 
- */ - char [] curCodePointUTF16 = new char [] {curChar, nextChar}; - ch = Lexer.codePointAt(curCodePointUTF16, 0); - next(); - next(); - } else { - ch = next(); + private int nextCodePoint() { + char high = pattern[nextIndex()]; + + if (Character.isHighSurrogate(high)) { + + //low and high char may be delimited by spaces + int lowExpectedIndex = prevNW + 1; + + if (lowExpectedIndex < pattern.length) { + char low = pattern[lowExpectedIndex]; + if (Character.isLowSurrogate(low)) { + nextIndex(); + return Character.toCodePoint(high, low); + } } - } + } - return ch; + return (int) high; } /** @@ -1330,7 +1292,7 @@ //Lexer.getCanonicalClass(ch) == 0 boolean isBoundary = (canClass == canonClassesTableSize); - return isBoundary; + return isBoundary; } /** Index: modules/regex/src/main/java/java/util/regex/SequenceSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/SequenceSet.java (revision 450389) +++ modules/regex/src/main/java/java/util/regex/SequenceSet.java (working copy) @@ -106,6 +106,16 @@ return ((CharSet) set).getChar() == string.charAt(0); } else if (set instanceof RangeSet) { return ((RangeSet) set).accepts(0, string.substring(0, 1)) > 0; + } else if (set instanceof SupplRangeSet) { + return ((SupplRangeSet) set).contains(string.charAt(0)) + || ((string.length() > 1) && ((SupplRangeSet) set).contains(Character + .toCodePoint(string.charAt(0), string.charAt(1)))); + } else if ((set instanceof SupplCharSet)) { + return (string.length() > 1) + ? 
((SupplCharSet) set).getCodePoint() + == Character.toCodePoint(string.charAt(0), + string.charAt(1)) + : false; } return true; Index: modules/regex/src/main/java/java/util/regex/DotQuantifierSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/DotQuantifierSet.java (revision 450389) +++ modules/regex/src/main/java/java/util/regex/DotQuantifierSet.java (working copy) @@ -28,11 +28,11 @@ * @author Nikolay A. Kuznetsov * @version $Revision: 1.11.2.2 $ */ -class DotQuantifierSet extends LeafQuantifierSet { +class DotQuantifierSet extends QuantifierSet { AbstractLineTerminator lt; - public DotQuantifierSet(LeafSet innerSet, AbstractSet next, int type, + public DotQuantifierSet(AbstractSet innerSet, AbstractSet next, int type, AbstractLineTerminator lt) { super(innerSet, next, type); this.lt = lt; @@ -47,7 +47,7 @@ findLineTerminator(stringIndex, strLength, testString); if (startSearch < 0) { - startSearch = matchResult.getRightBound(); + startSearch = strLength; } if (startSearch <= stringIndex) { @@ -96,6 +96,9 @@ return res; } + /* + * All line terminators are from Basic Multilingual Plane + */ private int findLineTerminator(int from, int to, CharSequence testString) { for (int i = from; i < to; i++) { if (lt.isLineTerminator(testString.charAt(i))) { @@ -114,4 +117,7 @@ return -1; } + protected String getName() { + return ""; + } } Index: modules/regex/src/main/java/java/util/regex/DotAllSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/DotAllSet.java (revision 450389) +++ modules/regex/src/main/java/java/util/regex/DotAllSet.java (working copy) @@ -26,17 +26,48 @@ * @author Nikolay A. 
Kuznetsov * @version $Revision: 1.6.2.2 $ */ -class DotAllSet extends LeafSet { +class DotAllSet extends JointSet { - public int accepts(int strIndex, CharSequence testString) { - return 1; - } + public int matches(int stringIndex, CharSequence testString, + MatchResultImpl matchResult) { + int strLength = matchResult.getRightBound(); + + if (stringIndex + 1 > strLength) { + matchResult.hitEnd = true; + return -1; + } + + char high = testString.charAt(stringIndex); + + if (Character.isHighSurrogate(high) && (stringIndex + 2 <= strLength)) { + char low = testString.charAt(stringIndex + 1); + + if (Character.isSurrogatePair(high, low)) { + return next.matches(stringIndex + 2, testString, matchResult); + } + } + return next.matches(stringIndex + 1, testString, matchResult); + } protected String getName() { return "DotAll"; //$NON-NLS-1$ } + + public AbstractSet getNext() { + return this.next; + } + + public void setNext(AbstractSet next) { + this.next = next; + } + public int getType() { return AbstractSet.TYPE_DOTSET; } + + + public boolean hasConsumed(MatchResultImpl matchResult) { + return true; + } } Index: modules/regex/src/main/java/java/util/regex/CompositeRangeSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/CompositeRangeSet.java (revision 0) +++ modules/regex/src/main/java/java/util/regex/CompositeRangeSet.java (revision 0) @@ -0,0 +1,165 @@ +/* + * Copyright 2006 The Apache Software Foundation or its licensors, as applicable. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * + * Portions, Copyright © 1991-2005 Unicode, Inc. The following applies to Unicode. + * + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright © 1991-2005 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. Permission is + * hereby granted, free of charge, to any person obtaining a copy of the + * Unicode data files and any associated documentation (the "Data Files") + * or Unicode software and any associated documentation (the "Software") + * to deal in the Data Files or Software without restriction, including without + * limitation the rights to use, copy, modify, merge, publish, distribute, + * and/or sell copies of the Data Files or Software, and to permit persons + * to whom the Data Files or Software are furnished to do so, provided that + * (a) the above copyright notice(s) and this permission notice appear with + * all copies of the Data Files or Software, (b) both the above copyright + * notice(s) and this permission notice appear in associated documentation, + * and (c) there is clear notice in each modified Data File or in the Software + * as well as in the documentation associated with the Data File(s) or Software + * that the data or software has been modified. + + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT + * OF THIRD PARTY RIGHTS. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + * + * 2. Additional terms from the Database: + * + * Copyright © 1995-1999 Unicode, Inc. All Rights reserved. + * + * Disclaimer + * + * The Unicode Character Database is provided as is by Unicode, Inc. + * No claims are made as to fitness for any particular purpose. No warranties + * of any kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been purchased + * on magnetic or optical media from Unicode, Inc., the sole remedy for any claim + * will be exchange of defective media within 90 days of receipt. This disclaimer + * is applicable for all other data files accompanying the Unicode Character Database, + * some of which have been compiled by the Unicode Consortium, and some of which + * have been supplied by other sources. + * + * Limitations on Rights to Redistribute This Data + * + * Recipient is granted the right to make copies in any form for internal + * distribution and to freely use the information supplied in the creation of + * products supporting the UnicodeTM Standard. The files in + * the Unicode Character Database can be redistributed to third parties or other + * organizations (whether for profit or not) as long as this notice and the disclaimer + * notice are retained. 
Information can be extracted from these files and used + * in documentation or programs, as long as there is an accompanying notice + * indicating the source. + */ + +package java.util.regex; + +/** + * This class is used to split the range that contains surrogate + * characters into two ranges: the first consisting of these surrogate + * characters and the second consisting of all other characters + * from the parent range. + * This class represents the parent range split in such a manner. + */ +class CompositeRangeSet extends JointSet { + + //range without surrogates + AbstractSet withoutSurrogates; + + //range containing surrogates only + AbstractSet withSurrogates; + + public CompositeRangeSet(AbstractSet withoutSurrogates, + AbstractSet withSurrogates, AbstractSet next) { + this.withoutSurrogates = withoutSurrogates; + this.withSurrogates = withSurrogates; + setNext(next); + } + + public CompositeRangeSet(AbstractSet withoutSurrogates, + AbstractSet withSurrogates) { + this.withoutSurrogates = withoutSurrogates; + this.withSurrogates = withSurrogates; + } + + /** + * Returns the next. + */ + public AbstractSet getNext() { + return this.next; + } + + public int matches(int stringIndex, CharSequence testString, + MatchResultImpl matchResult) { + int shift = withoutSurrogates.matches(stringIndex, testString, matchResult); + + if (shift < 0) { + shift = withSurrogates.matches(stringIndex, testString, matchResult); + } + + if (shift >= 0) { + return shift; + } + return -1; + } + + /** + * Sets next abstract set. + * @param next + * The next to set. 
+ */ + public void setNext(AbstractSet next) { + this.next = next; + withSurrogates.setNext(next); + withoutSurrogates.setNext(next); + } + + public AbstractSet getSurrogates() { + return withSurrogates; + } + + public AbstractSet getWithoutSurrogates() { + return withoutSurrogates; + } + + protected String getName() { + return "CompositeRangeSet: " + " " + + withoutSurrogates + " " + + withSurrogates; + } + + public boolean hasConsumed(MatchResultImpl matchResult) { + return true; + } + + public boolean first(AbstractSet set) { + return true; + } +} Index: modules/regex/src/main/java/java/util/regex/DotSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/DotSet.java (revision 450389) +++ modules/regex/src/main/java/java/util/regex/DotSet.java (working copy) @@ -26,7 +26,7 @@ * @author Nikolay A. Kuznetsov * @version $Revision: 1.12.2.2 $ */ -final class DotSet extends LeafSet { +final class DotSet extends JointSet { AbstractLineTerminator lt; @@ -35,21 +35,47 @@ this.lt = lt; } - public int accepts(int strIndex, CharSequence testString) { - char ch = testString.charAt(strIndex); - return lt.isLineTerminator(ch) ? -1 : 1; + public int matches(int stringIndex, CharSequence testString, + MatchResultImpl matchResult) { + int strLength = matchResult.getRightBound(); - /* - * return (strIndex strLength) { + matchResult.hitEnd = true; + return -1; + } + char high = testString.charAt(stringIndex); + + if (Character.isHighSurrogate(high) && (stringIndex + 2 <= strLength)) { + char low = testString.charAt(stringIndex + 1); + + if (Character.isSurrogatePair(high, low)) { + return lt.isLineTerminator(Character.toCodePoint(high, low))? -1 + : next.matches(stringIndex + 2, testString, matchResult); + } + } + + return lt.isLineTerminator(high)? 
-1 + : next.matches(stringIndex + 1, testString, matchResult); } protected String getName() { return "."; //$NON-NLS-1$ } + + public AbstractSet getNext() { + return this.next; + } + + public void setNext(AbstractSet next) { + this.next = next; + } + public int getType() { return AbstractSet.TYPE_DOTSET; } + + public boolean hasConsumed(MatchResultImpl matchResult) { + return true; + } } Index: modules/regex/src/main/java/java/util/regex/CharClass.java =================================================================== --- modules/regex/src/main/java/java/util/regex/CharClass.java (revision 450389) +++ modules/regex/src/main/java/java/util/regex/CharClass.java (working copy) @@ -39,6 +39,8 @@ // Flag indicates if there are unicode supplements boolean hasUCI = false; + boolean invertedSurrogates = false; + boolean inverted = false; boolean hideBits = false; @@ -60,6 +62,10 @@ setNegative(negative); } + /* + * We can use this method safely even if nonBitSet != null + * due to the specifics of range constructions in regular expressions. 
+ */ public CharClass add(int ch) { if (ci) { if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) { @@ -70,52 +76,151 @@ } } else if (uci && ch > 128) { hasUCI = true; - ch = Character.toLowerCase(Character.toUpperCase((char) ch)); + ch = Character.toLowerCase(Character.toUpperCase(ch)); // return this; } } + + if (Lexer.isHighSurrogate(ch) || Lexer.isLowSurrogate(ch)) { + if (!invertedSurrogates) { + lowHighSurrogates.set(ch - Character.MIN_SURROGATE); + } else { + lowHighSurrogates.clear(ch - Character.MIN_SURROGATE); + } + } + if (!inverted) { bits.set(ch); } else - bits.clear(); + bits.clear(ch); + if (!mayContainSupplCodepoints && Character.isSupplementaryCodePoint(ch)) { + mayContainSupplCodepoints = true; + } + return this; } + /* + * The difference between add(AbstarctCharClass) and union(AbstractCharClass) + * is that add() is used for constructions like "[^abc\\d]" + * (this pattern doesn't match "1") + * while union is used for constructions like "[^abc[\\d]]" + * (this pattern matches "1"). + */ public CharClass add(final AbstractCharClass cc) { - if (cc.getBits() != null) { + + if (!mayContainSupplCodepoints && cc.mayContainSupplCodepoints) { + mayContainSupplCodepoints = true; + } + + if (!invertedSurrogates) { + + //A | !B = ! ((A ^ B) & B) + if (cc.altSurrogates) { + lowHighSurrogates.xor(cc.getLowHighSurrogates()); + lowHighSurrogates.and(cc.getLowHighSurrogates()); + altSurrogates = !altSurrogates; + invertedSurrogates = true; + + //A | B + } else { + lowHighSurrogates.or(cc.getLowHighSurrogates()); + } + } else { + + //!A | !B = !(A & B) + if (cc.altSurrogates) { + lowHighSurrogates.and(cc.getLowHighSurrogates()); + + //!A | B = !(A & !B) + } else { + lowHighSurrogates.andNot(cc.getLowHighSurrogates()); + } + } + + if (!hideBits && cc.getBits() != null) { if (!inverted) { + + //A | !B = ! 
((A ^ B) & B) if (cc.isNegative()) { bits.xor(cc.getBits()); bits.and(cc.getBits()); alt = !alt; inverted = true; + + //A | B } else { bits.or(cc.getBits()); } } else { + + //!A | !B = !(A & B) if (cc.isNegative()) { bits.and(cc.getBits()); + + //!A | B = !(A & !B) } else { bits.andNot(cc.getBits()); } } - } else { + } else { + final boolean curAlt = alt; + if (nonBitSet == null) { - // hide bits true at the moment - nonBitSet = new AbstractCharClass() { - public boolean contains(int ch) { - return cc.contains(ch) || bits.get(ch); + + if (curAlt && !inverted && bits.isEmpty()) { + nonBitSet = new AbstractCharClass() { + public boolean contains(int ch) { + return cc.contains(ch); + } + }; + //alt = true; + } else { + + /* + * We keep the value of alt unchanged for + * constructions like [^[abc]fgb] by using + * the formula a ^ b == !a ^ !b. + */ + if (curAlt) { + nonBitSet = new AbstractCharClass() { + public boolean contains(int ch) { + return !((curAlt ^ bits.get(ch)) + || ((curAlt ^ inverted) ^ cc.contains(ch))); + } + }; + //alt = true + } else { + nonBitSet = new AbstractCharClass() { + public boolean contains(int ch) { + return (curAlt ^ bits.get(ch)) + || ((curAlt ^ inverted) ^ cc.contains(ch)); + } + }; + //alt = false } - }; - hideBits = true; + } + + hideBits = true; } else { final AbstractCharClass nb = nonBitSet; - nonBitSet = new AbstractCharClass() { - public boolean contains(int ch) { - return nb.contains(ch) || cc.contains(ch); - } - }; + + if (curAlt) { + nonBitSet = new AbstractCharClass() { + public boolean contains(int ch) { + return !(curAlt ^ (nb.contains(ch) || cc.contains(ch))); + } + }; + //alt = true + } else { + nonBitSet = new AbstractCharClass() { + public boolean contains(int ch) { + return curAlt ^ (nb.contains(ch) || cc.contains(ch)); + } + }; + //alt = false + } } } @@ -125,7 +230,11 @@ public CharClass add(int st, int end) { if (st > end) throw new IllegalArgumentException(); - if (!ci) { + if (!ci + + //no intersection with 
surrogate characters + && (end < Character.MIN_SURROGATE + || st > Character.MAX_SURROGATE)) { if (!inverted) { bits.set(st, end + 1); } else { @@ -138,81 +247,247 @@ } return this; } - + // OR operation public void union(final AbstractCharClass clazz) { + if (!mayContainSupplCodepoints + && clazz.mayContainSupplCodepoints) { + mayContainSupplCodepoints = true; + } + if (clazz.hasUCI()) this.hasUCI = true; + + + if (altSurrogates ^ clazz.altSurrogates) { + + //!A | B = !(A & !B) + if (altSurrogates) { + lowHighSurrogates.andNot(clazz.getLowHighSurrogates()); + + //A | !B = !((A ^ B) & B) + } else { + lowHighSurrogates.xor(clazz.getLowHighSurrogates()); + lowHighSurrogates.and(clazz.getLowHighSurrogates()); + altSurrogates = true; + } + + } else { + + //!A | !B = !(A & B) + if (altSurrogates) { + lowHighSurrogates.and(clazz.getLowHighSurrogates()); + + //A | B + } else { + lowHighSurrogates.or(clazz.getLowHighSurrogates()); + } + } + if (!hideBits && clazz.getBits() != null) { if (alt ^ clazz.isNegative()) { + + //!A | B = !(A & !B) if (alt) { bits.andNot(clazz.getBits()); + + //A | !B = !((A ^ B) & B) } else { bits.xor(clazz.getBits()); bits.and(clazz.getBits()); + alt = true; } - alt = true; + } else { - if (alt) { + + //!A | !B = !(A & B) + if (alt) { bits.and(clazz.getBits()); - } else { + + //A | B + } else { bits.or(clazz.getBits()); } } } else { + final boolean curAlt = alt; + if (nonBitSet == null) { - nonBitSet = new AbstractCharClass() { - public boolean contains(int ch) { - return clazz.contains(ch) || bits.get(ch); + + if (!inverted && bits.isEmpty()) { + if (curAlt) { + nonBitSet = new AbstractCharClass() { + public boolean contains(int ch) { + return !clazz.contains(ch); + } + }; + //alt = true + } else { + nonBitSet = new AbstractCharClass() { + public boolean contains(int ch) { + return clazz.contains(ch); + } + }; + //alt = false } - }; + } else { + + if (curAlt) { + nonBitSet = new AbstractCharClass() { + public boolean contains(int ch) { + return 
!(clazz.contains(ch) || (curAlt ^ bits.get(ch))); + } + }; + //alt = true + } else { + nonBitSet = new AbstractCharClass() { + public boolean contains(int ch) { + return clazz.contains(ch) || (curAlt ^ bits.get(ch)); + } + }; + //alt = false + } + } hideBits = true; } else { final AbstractCharClass nb = nonBitSet; - nonBitSet = new AbstractCharClass() { - public boolean contains(int ch) { - return nb.contains(ch) || clazz.contains(ch); - } - }; + + if (curAlt) { + nonBitSet = new AbstractCharClass() { + public boolean contains(int ch) { + return !((curAlt ^ nb.contains(ch)) || clazz.contains(ch)); + } + }; + //alt = true + } else { + nonBitSet = new AbstractCharClass() { + public boolean contains(int ch) { + return (curAlt ^ nb.contains(ch)) || clazz.contains(ch); + } + }; + //alt = false + } } } } // AND operation public void intersection(final AbstractCharClass clazz) { + if (!mayContainSupplCodepoints + && clazz.mayContainSupplCodepoints) { + mayContainSupplCodepoints = true; + } + if (clazz.hasUCI()) this.hasUCI = true; + + if (altSurrogates ^ clazz.altSurrogates) { + + //!A & B = ((A ^ B) & B) + if (altSurrogates) { + lowHighSurrogates.xor(clazz.getLowHighSurrogates()); + lowHighSurrogates.and(clazz.getLowHighSurrogates()); + altSurrogates = false; + + //A & !B + } else { + lowHighSurrogates.andNot(clazz.getLowHighSurrogates()); + } + } else { + + //!A & !B = !(A | B) + if (altSurrogates) { + lowHighSurrogates.or(clazz.getLowHighSurrogates()); + + //A & B + } else { + lowHighSurrogates.and(clazz.getLowHighSurrogates()); + } + } + if (!hideBits && clazz.getBits() != null) { + if (alt ^ clazz.isNegative()) { + + //!A & B = ((A ^ B) & B) if (alt) { bits.xor(clazz.getBits()); bits.and(clazz.getBits()); - setNegative(false); + alt = false; + + //A & !B } else { bits.andNot(clazz.getBits()); } } else { + + //!A & !B = !(A | B) if (alt) { bits.or(clazz.getBits()); + + //A & B } else { bits.and(clazz.getBits()); } } } else { - if (nonBitSet == null) { - nonBitSet = 
new AbstractCharClass() { - public boolean contains(int ch) { - return bits.get(ch) && clazz.contains(ch); + final boolean curAlt = alt; + + if (nonBitSet == null) { + + if (!inverted && bits.isEmpty()) { + if (curAlt) { + nonBitSet = new AbstractCharClass() { + public boolean contains(int ch) { + return !clazz.contains(ch); + } + }; + //alt = true + } else { + nonBitSet = new AbstractCharClass() { + public boolean contains(int ch) { + return clazz.contains(ch); + } + }; + //alt = false } - }; + } else { + + if (curAlt) { + nonBitSet = new AbstractCharClass() { + public boolean contains(int ch) { + return !(clazz.contains(ch) && (curAlt ^ bits.get(ch))); + } + }; + //alt = true + } else { + nonBitSet = new AbstractCharClass() { + public boolean contains(int ch) { + return clazz.contains(ch) && (curAlt ^ bits.get(ch)); + } + }; + //alt = false + } + } hideBits = true; } else { final AbstractCharClass nb = nonBitSet; - nonBitSet = new AbstractCharClass() { - public boolean contains(int ch) { - return nb.contains(ch) && clazz.contains(ch); - } - }; + + if (curAlt) { + nonBitSet = new AbstractCharClass() { + public boolean contains(int ch) { + return !((curAlt ^ nb.contains(ch)) && clazz.contains(ch)); + } + }; + //alt = true + } else { + nonBitSet = new AbstractCharClass() { + public boolean contains(int ch) { + return (curAlt ^ nb.contains(ch)) && clazz.contains(ch); + } + }; + //alt = false + } } } } @@ -243,9 +518,15 @@ return bits; } + protected BitSet getLowHighSurrogates() { + return lowHighSurrogates; + } + public AbstractCharClass getInstance() { + if (nonBitSet == null) { final BitSet bs = getBits(); + AbstractCharClass res = new AbstractCharClass() { public boolean contains(int ch) { return this.alt ^ bs.get(ch); @@ -255,7 +536,7 @@ StringBuffer temp = new StringBuffer(); for (int i = bs.nextSetBit(0); i >= 0; i = bs .nextSetBit(i + 1)) { - temp.append((char) i); + temp.append(Character.toChars(i)); temp.append('|'); } @@ -272,10 +553,11 @@ } } + //for 
debugging purposes only public String toString() { StringBuffer temp = new StringBuffer(); for (int i = bits.nextSetBit(0); i >= 0; i = bits.nextSetBit(i + 1)) { - temp.append((char) i); + temp.append(Character.toChars(i)); temp.append('|'); } Index: modules/regex/src/main/java/java/util/regex/LeafQuantifierSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/LeafQuantifierSet.java (revision 450389) +++ modules/regex/src/main/java/java/util/regex/LeafQuantifierSet.java (working copy) @@ -52,7 +52,7 @@ return shift; } - stringIndex--; + stringIndex -= leaf.charCount(); } return -1; } Index: modules/regex/src/main/java/java/util/regex/HighSurrogateCharSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/HighSurrogateCharSet.java (revision 0) +++ modules/regex/src/main/java/java/util/regex/HighSurrogateCharSet.java (revision 0) @@ -0,0 +1,250 @@ +/* + * Copyright 2006 The Apache Software Foundation or its licensors, as applicable. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * + * Portions, Copyright © 1991-2005 Unicode, Inc. The following applies to Unicode. + * + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright © 1991-2005 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. 
Permission is + * hereby granted, free of charge, to any person obtaining a copy of the + * Unicode data files and any associated documentation (the "Data Files") + * or Unicode software and any associated documentation (the "Software") + * to deal in the Data Files or Software without restriction, including without + * limitation the rights to use, copy, modify, merge, publish, distribute, + * and/or sell copies of the Data Files or Software, and to permit persons + * to whom the Data Files or Software are furnished to do so, provided that + * (a) the above copyright notice(s) and this permission notice appear with + * all copies of the Data Files or Software, (b) both the above copyright + * notice(s) and this permission notice appear in associated documentation, + * and (c) there is clear notice in each modified Data File or in the Software + * as well as in the documentation associated with the Data File(s) or Software + * that the data or software has been modified. + + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT + * OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + * + * 2. 
Additional terms from the Database: + * + * Copyright © 1995-1999 Unicode, Inc. All Rights reserved. + * + * Disclaimer + * + * The Unicode Character Database is provided as is by Unicode, Inc. + * No claims are made as to fitness for any particular purpose. No warranties + * of any kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been purchased + * on magnetic or optical media from Unicode, Inc., the sole remedy for any claim + * will be exchange of defective media within 90 days of receipt. This disclaimer + * is applicable for all other data files accompanying the Unicode Character Database, + * some of which have been compiled by the Unicode Consortium, and some of which + * have been supplied by other sources. + * + * Limitations on Rights to Redistribute This Data + * + * Recipient is granted the right to make copies in any form for internal + * distribution and to freely use the information supplied in the creation of + * products supporting the UnicodeTM Standard. The files in + * the Unicode Character Database can be redistributed to third parties or other + * organizations (whether for profit or not) as long as this notice and the disclaimer + * notice are retained. Information can be extracted from these files and used + * in documentation or programs, as long as there is an accompanying notice + * indicating the source. + */ + +package java.util.regex; + +/** + * This class represents high surrogate character. + */ +class HighSurrogateCharSet extends JointSet{ + + /* + * Note that we can use high and low surrogate characters + * that don't combine into supplementary code point. + * See http://www.unicode.org/reports/tr18/#Supplementary_Characters + */ + + private char high; + + public HighSurrogateCharSet(char high) { + this.high = high; + } + + /** + * Returns the next. + */ + public AbstractSet getNext() { + return this.next; + } + + /** + * Sets next abstract set. 
+ * @param next + * The next to set. + */ + public void setNext(AbstractSet next) { + this.next = next; + } + + public int matches(int stringIndex, CharSequence testString, + MatchResultImpl matchResult) { + int strLength = matchResult.getRightBound(); + + if (stringIndex + 1 > strLength) { + matchResult.hitEnd = true; + return -1; + } + + char high = testString.charAt(stringIndex); + + if (stringIndex + 1 < strLength) { + char low = testString.charAt(stringIndex + 1); + + /* + * we consider high surrogate followed by + * low surrogate as a codepoint + */ + if (Character.isLowSurrogate(low)) { + return -1; + } + } + + if (this.high == high) { + return next.matches(stringIndex + 1, testString, + matchResult); + } + + return -1; + } + + public int find(int strIndex, CharSequence testString, + MatchResultImpl matchResult) { + if (testString instanceof String) { + String testStr = (String) testString; + int strLength = matchResult.getRightBound(); + + while (strIndex < strLength) { + + strIndex = testStr.indexOf(high, strIndex); + if (strIndex < 0) + return -1; + + if (strIndex + 1 < strLength ) { + + /* + * we consider high surrogate followed by + * low surrogate as a codepoint + */ + if (Character.isLowSurrogate(testStr.charAt(strIndex + 1))) { + strIndex += 2; + continue; + } + } + + if (next.matches(strIndex + 1, testString, matchResult) >= 0) { + return strIndex; + } + strIndex++; + } + + return -1; + } + + return super.find(strIndex, testString, matchResult); + } + + public int findBack(int strIndex, int lastIndex, CharSequence testString, + MatchResultImpl matchResult) { + if (testString instanceof String) { + String testStr = (String) testString; + int strLength = matchResult.getRightBound(); + + while (lastIndex >= strIndex) { + lastIndex = testStr.lastIndexOf(high, lastIndex); + if (lastIndex < 0 || lastIndex < strIndex) { + return -1; + } + + if (lastIndex + 1 < strLength) { + + /* + * we consider high surrogate followed by + * low surrogate as a codepoint + 
*/ + if (Character.isLowSurrogate(testStr.charAt(lastIndex + 1))) { + lastIndex--; + continue; + } + } + + if (next.matches(lastIndex + 1, testString, matchResult) >= 0) { + return lastIndex; + } + + lastIndex--; + } + + return -1; + } + + return super.findBack(strIndex, lastIndex, testString, matchResult); + } + + protected String getName() { + return "" + high; + } + + protected int getChar() { + return high; + } + + public boolean first(AbstractSet set) { + if (set instanceof CharSet) { + return false; + } else if (set instanceof RangeSet) { + return false; + } else if (set instanceof SupplRangeSet) { + return false; + } else if (set instanceof SupplCharSet) { + return false; + } else if (set instanceof LowSurrogateCharSet) { + return false; + } else if (set instanceof HighSurrogateCharSet) { + return ((HighSurrogateCharSet) set).high == this.high; + } + + return true; + } + + public boolean hasConsumed(MatchResultImpl matchResult) { + return true; + } +} \ No newline at end of file Index: modules/regex/src/main/java/java/util/regex/LowHighSurrogateRangeSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/LowHighSurrogateRangeSet.java (revision 0) +++ modules/regex/src/main/java/java/util/regex/LowHighSurrogateRangeSet.java (revision 0) @@ -0,0 +1,187 @@ +/* + * Copyright 2006 The Apache Software Foundation or its licensors, as applicable. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * + * Portions, Copyright © 1991-2005 Unicode, Inc. The following applies to Unicode. + * + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright © 1991-2005 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. Permission is + * hereby granted, free of charge, to any person obtaining a copy of the + * Unicode data files and any associated documentation (the "Data Files") + * or Unicode software and any associated documentation (the "Software") + * to deal in the Data Files or Software without restriction, including without + * limitation the rights to use, copy, modify, merge, publish, distribute, + * and/or sell copies of the Data Files or Software, and to permit persons + * to whom the Data Files or Software are furnished to do so, provided that + * (a) the above copyright notice(s) and this permission notice appear with + * all copies of the Data Files or Software, (b) both the above copyright + * notice(s) and this permission notice appear in associated documentation, + * and (c) there is clear notice in each modified Data File or in the Software + * as well as in the documentation associated with the Data File(s) or Software + * that the data or software has been modified. + + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT + * OF THIRD PARTY RIGHTS. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + * + * 2. Additional terms from the Database: + * + * Copyright © 1995-1999 Unicode, Inc. All Rights reserved. + * + * Disclaimer + * + * The Unicode Character Database is provided as is by Unicode, Inc. + * No claims are made as to fitness for any particular purpose. No warranties + * of any kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been purchased + * on magnetic or optical media from Unicode, Inc., the sole remedy for any claim + * will be exchange of defective media within 90 days of receipt. This disclaimer + * is applicable for all other data files accompanying the Unicode Character Database, + * some of which have been compiled by the Unicode Consortium, and some of which + * have been supplied by other sources. + * + * Limitations on Rights to Redistribute This Data + * + * Recipient is granted the right to make copies in any form for internal + * distribution and to freely use the information supplied in the creation of + * products supporting the UnicodeTM Standard. The files in + * the Unicode Character Database can be redistributed to third parties or other + * organizations (whether for profit or not) as long as this notice and the disclaimer + * notice are retained. 
Information can be extracted from these files and used + * in documentation or programs, as long as there is an accompanying notice + * indicating the source. + */ + +package java.util.regex; + +/* + * This class is a range that contains only surrogate characters. + */ +class LowHighSurrogateRangeSet extends JointSet { + + protected AbstractCharClass surrChars; + + protected boolean alt = false; + + public LowHighSurrogateRangeSet(AbstractCharClass surrChars, AbstractSet next) { + this.surrChars = surrChars.getInstance(); + this.alt = surrChars.alt; + setNext(next); + } + + public LowHighSurrogateRangeSet(AbstractCharClass surrChars) { + this.surrChars = surrChars.getInstance(); + this.alt = surrChars.alt; + } + + /** + * Returns the next. + */ + public AbstractSet getNext() { + return this.next; + } + + /** + * Sets next abstract set. + * @param next + * The next to set. + */ + public void setNext(AbstractSet next) { + this.next = next; + } + + /** + * Returns stringIndex+shift, the next position to match + */ + public int matches(int stringIndex, CharSequence testString, + MatchResultImpl matchResult) { + int startStr = matchResult.getLeftBound(); + int strLength = matchResult.getRightBound(); + + if (stringIndex + 1 > strLength) { + matchResult.hitEnd = true; + return -1; + } + + char ch = testString.charAt(stringIndex); + + if (!surrChars.contains(ch)) { + return -1; + } + + if (Character.isHighSurrogate(ch)) { + + if (stringIndex + 1 < strLength) { + char low = testString.charAt(stringIndex + 1); + + if (Character.isLowSurrogate(low)) { + return -1; + } + } + } else if (Character.isLowSurrogate(ch)) { + + if (stringIndex > startStr) { + char high = testString.charAt(stringIndex - 1); + + if (Character.isHighSurrogate(high)) { + return -1; + } + } + } + + return next.matches(stringIndex + 1, testString, matchResult); + } + + protected String getName() { + return "range:" + (alt ? 
"^ " : " ") + surrChars.toString(); + } + + public boolean first(AbstractSet set) { + if (set instanceof CharSet) { + return false; + } else if (set instanceof RangeSet) { + return false; + } else if (set instanceof SupplRangeSet) { + return false; + } else if (set instanceof SupplCharSet) { + return false; + } + + return true; + } + + protected AbstractCharClass getChars() { + return surrChars; + } + + public boolean hasConsumed(MatchResultImpl matchResult) { + return true; + } +} \ No newline at end of file Index: modules/regex/src/main/java/java/util/regex/UnifiedQuantifierSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/UnifiedQuantifierSet.java (revision 450389) +++ modules/regex/src/main/java/java/util/regex/UnifiedQuantifierSet.java (working copy) @@ -43,7 +43,7 @@ MatchResultImpl matchResult) { while (stringIndex + leaf.charCount() <= matchResult.getRightBound() && leaf.accepts(stringIndex, testString) > 0) - stringIndex++; + stringIndex += leaf.charCount(); return next.matches(stringIndex, testString, matchResult); } @@ -53,11 +53,11 @@ int startSearch = next.find(stringIndex, testString, matchResult); if (startSearch < 0) return -1; - int newSearch = startSearch - 1; + int newSearch = startSearch - leaf.charCount(); while (newSearch >= stringIndex && leaf.accepts(newSearch, testString) > 0) { startSearch = newSearch; - newSearch--; + newSearch -= leaf.charCount(); } return startSearch; Index: modules/regex/src/main/java/java/util/regex/SupplRangeSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/SupplRangeSet.java (revision 0) +++ modules/regex/src/main/java/java/util/regex/SupplRangeSet.java (revision 0) @@ -0,0 +1,173 @@ +/* + * Copyright 2006 The Apache Software Foundation or its licensors, as applicable. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * + * Portions, Copyright © 1991-2005 Unicode, Inc. The following applies to Unicode. + * + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright © 1991-2005 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. Permission is + * hereby granted, free of charge, to any person obtaining a copy of the + * Unicode data files and any associated documentation (the "Data Files") + * or Unicode software and any associated documentation (the "Software") + * to deal in the Data Files or Software without restriction, including without + * limitation the rights to use, copy, modify, merge, publish, distribute, + * and/or sell copies of the Data Files or Software, and to permit persons + * to whom the Data Files or Software are furnished to do so, provided that + * (a) the above copyright notice(s) and this permission notice appear with + * all copies of the Data Files or Software, (b) both the above copyright + * notice(s) and this permission notice appear in associated documentation, + * and (c) there is clear notice in each modified Data File or in the Software + * as well as in the documentation associated with the Data File(s) or Software + * that the data or software has been modified. 
+ + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT + * OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + * + * 2. Additional terms from the Database: + * + * Copyright © 1995-1999 Unicode, Inc. All Rights reserved. + * + * Disclaimer + * + * The Unicode Character Database is provided as is by Unicode, Inc. + * No claims are made as to fitness for any particular purpose. No warranties + * of any kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been purchased + * on magnetic or optical media from Unicode, Inc., the sole remedy for any claim + * will be exchange of defective media within 90 days of receipt. This disclaimer + * is applicable for all other data files accompanying the Unicode Character Database, + * some of which have been compiled by the Unicode Consortium, and some of which + * have been supplied by other sources. 
+ * + * Limitations on Rights to Redistribute This Data + * + * Recipient is granted the right to make copies in any form for internal + * distribution and to freely use the information supplied in the creation of + * products supporting the UnicodeTM Standard. The files in + * the Unicode Character Database can be redistributed to third parties or other + * organizations (whether for profit or not) as long as this notice and the disclaimer + * notice are retained. Information can be extracted from these files and used + * in documentation or programs, as long as there is an accompanying notice + * indicating the source. + */ + +package java.util.regex; + +/** + * Represents node accepting single character from the given char class. + * This character can be supplementary (2 chars needed to represent) or from + * basic multilingual pane (1 needed char to represent it). + */ +class SupplRangeSet extends JointSet { + + protected AbstractCharClass chars; + + protected boolean alt = false; + + public SupplRangeSet(AbstractCharClass cs, AbstractSet next) { + this.chars = cs.getInstance(); + this.alt = cs.alt; + this.next = next; + } + + public SupplRangeSet(AbstractCharClass cc) { + this.chars = cc.getInstance(); + this.alt = cc.alt; + } + + public int matches(int stringIndex, CharSequence testString, + MatchResultImpl matchResult) { + int strLength = matchResult.getRightBound(); + int offset = -1; + + if (stringIndex < strLength) { + char high = testString.charAt(stringIndex++); + + if (contains(high) && + (offset = next.matches(stringIndex, testString, matchResult)) > 0) { + return offset; + } + + if (stringIndex < strLength) { + char low = testString.charAt(stringIndex++); + + if (Character.isSurrogatePair(high, low) + && contains(Character.toCodePoint(high, low))) { + return next.matches(stringIndex, testString, matchResult); + } + } + } + + return -1; + } + + protected String getName() { + return "range:" + (alt ? 
"^ " : " ") + chars.toString(); + } + + public boolean contains(int ch) { + return chars.contains(ch); + } + + public boolean first(AbstractSet set) { + if (set instanceof SupplCharSet) { + return AbstractCharClass.intersects(chars, ((SupplCharSet) set) + .getCodePoint()); + } else if (set instanceof CharSet) { + return AbstractCharClass.intersects(chars, ((CharSet) set) + .getChar()); + } else if (set instanceof SupplRangeSet) { + return AbstractCharClass.intersects(chars, ((SupplRangeSet) set) + .chars); + } else if (set instanceof RangeSet) { + return AbstractCharClass.intersects(chars, ((RangeSet) set) + .getChars()); + } + + return true; + } + + protected AbstractCharClass getChars() { + return chars; + } + + public AbstractSet getNext() { + return next; + } + + public void setNext(AbstractSet next) { + this.next = next; + } + + public boolean hasConsumed(MatchResultImpl mr) { + return true; + } +} Index: modules/regex/src/main/java/java/util/regex/SingleDecompositions.java =================================================================== --- modules/regex/src/main/java/java/util/regex/SingleDecompositions.java (revision 450389) +++ modules/regex/src/main/java/java/util/regex/SingleDecompositions.java (working copy) @@ -18,7 +18,7 @@ /** * This class gives us a hashtable that contains information about - * symbols that have decomposition and canonical class 0 that is + * symbols that are one symbol decompositions that is * generated from * http://www.unicode.org/Public/4.0-Update/UnicodeData-4.0.0.txt. 
*/ Index: modules/regex/src/main/java/java/util/regex/RangeSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/RangeSet.java (revision 450389) +++ modules/regex/src/main/java/java/util/regex/RangeSet.java (working copy) @@ -57,8 +57,18 @@ return AbstractCharClass.intersects(chars, ((CharSet) set) .getChar()); } else if (set instanceof RangeSet) { - return AbstractCharClass.intersects(chars, ((RangeSet) set).chars); + return AbstractCharClass.intersects(chars, ((RangeSet) set) + .chars); + } else if (set instanceof SupplRangeSet) { + return AbstractCharClass.intersects(chars, ((SupplRangeSet) set) + .getChars()); + } else if (set instanceof SupplCharSet) { + return false; } return true; } + + protected AbstractCharClass getChars() { + return chars; + } } Index: modules/regex/src/main/java/java/util/regex/Pattern.java =================================================================== --- modules/regex/src/main/java/java/util/regex/Pattern.java (revision 450389) +++ modules/regex/src/main/java/java/util/regex/Pattern.java (working copy) @@ -303,12 +303,10 @@ if (lexemes.peek() == Lexer.CHAR_VERTICAL_BAR) lexemes.next(); } - - if (!auxRange.hasUCI()) { - return new RangeSet(auxRange, last); - } else { - return new UCIRangeSet(auxRange, last); - } + AbstractSet rangeSet = processRangeSet(auxRange); + rangeSet.setNext(last); + + return rangeSet; } /** @@ -436,8 +434,11 @@ */ private AbstractSet processSequence(AbstractSet last) { StringBuffer substring = new StringBuffer(); + while (!lexemes.isEmpty() && lexemes.isLetter() + && !lexemes.isHighSurrogate() + && !lexemes.isLowSurrogate() && ((!lexemes.isNextSpecial() && lexemes.lookAhead() == 0) // end // of // pattern @@ -447,7 +448,13 @@ || (lexemes.lookAhead() & 0x8000ffff) == Lexer.CHAR_LEFT_PARENTHESIS || lexemes.lookAhead() == Lexer.CHAR_VERTICAL_BAR || lexemes .lookAhead() == Lexer.CHAR_DOLLAR)) { - substring.append((char) lexemes.next()); + 
int ch = lexemes.next(); + + if (Character.isSupplementaryCodePoint(ch)) { + substring.append(Character.toChars(ch)); + } else { + substring.append((char) ch); + } } if (!hasFlag(Pattern.CASE_INSENSITIVE)) { return new SequenceSet(substring); @@ -469,7 +476,7 @@ int curSymbIndex = -1; if (!lexemes.isEmpty() && lexemes.isLetter()) { - curSymb = lexemes.nextChar(); + curSymb = lexemes.next(); codePoints [readCodePoints] = curSymb; curSymbIndex = curSymb - Lexer.LBase; } @@ -485,12 +492,12 @@ codePointsHangul[readCodePoints++] = (char) curSymb; curSymb = lexemes.peek(); - curSymbIndex = curSymb - Lexer.VBase; - if ((curSymbIndex >= 0) && (curSymbIndex < Lexer.VCount)) { - codePointsHangul [readCodePoints++] = (char) curSymb; - lexemes.next(); - curSymb = lexemes.peek(); - curSymbIndex = curSymb - Lexer.TBase; + curSymbIndex = curSymb - Lexer.VBase; + if ((curSymbIndex >= 0) && (curSymbIndex < Lexer.VCount)) { + codePointsHangul [readCodePoints++] = (char) curSymb; + lexemes.next(); + curSymb = lexemes.peek(); + curSymbIndex = curSymb - Lexer.TBase; if ((curSymbIndex >= 0) && (curSymbIndex < Lexer.TCount)) { codePointsHangul [readCodePoints++] = (char) curSymb; lexemes.next(); @@ -502,18 +509,18 @@ //LV syllable return new HangulDecomposedCharSet(codePointsHangul, 2); } - } else { + } else { //L jamo if (!hasFlag(Pattern.CASE_INSENSITIVE)) { - return new CharSet(codePointsHangul[0]); - } else if (!hasFlag(Pattern.UNICODE_CASE)) { - return new CICharSet(codePointsHangul[0]); - } else { - return new UCICharSet(codePointsHangul[0]); - } - } - + return new CharSet(codePointsHangul[0]); + } else if (!hasFlag(Pattern.UNICODE_CASE)) { + return new CICharSet(codePointsHangul[0]); + } else { + return new UCICharSet(codePointsHangul[0]); + } + } + /* * We process single codepoint or decomposed codepoint. 
* We collect decomposed codepoint and obtain @@ -525,31 +532,15 @@ while((readCodePoints < Lexer.MAX_DECOMPOSITION_LENGTH) && !lexemes.isEmpty() && lexemes.isLetter() && !Lexer.isDecomposedCharBoundary(lexemes.peek())) { - codePoints [readCodePoints++] = lexemes.nextChar(); + codePoints [readCodePoints++] = lexemes.next(); } - - if (readCodePoints == 0) { - return null; - } - + /* - * We have read an ordinary Basic Multilingual Pane symbol. + * We have read an ordinary symbol. */ - if (readCodePoints == 1 - - /* - * We compile supplementary codepoint into - * DecomposedCharSet for convenience. - */ - && curSymb <= Lexer.MAX_CODEPOINT_BASIC_MULTILINGUAL_PANE + if (readCodePoints == 1 && !Lexer.hasSingleCodepointDecomposition(codePoints[0])) { - if (!hasFlag(Pattern.CASE_INSENSITIVE)) { - return new CharSet((char) codePoints[0]); - } else if (!hasFlag(Pattern.UNICODE_CASE)) { - return new CICharSet((char) codePoints[0]); - } else { - return new UCICharSet((char) codePoints[0]); - } + return processCharSet(codePoints[0]); } else { if (!hasFlag(Pattern.CASE_INSENSITIVE)) { return new DecomposedCharSet(codePoints, readCodePoints); @@ -580,6 +571,9 @@ && !lexemes.isLetter()) { cur = processQuantifier(last, cur); } + } else if (lexemes.isHighSurrogate() || lexemes.isLowSurrogate()) { + AbstractSet term = processTerminal(last); + cur = processQuantifier(last, term); } else { cur = processSequence(last); } @@ -641,8 +635,19 @@ switch (quant) { case Lexer.QUANT_STAR: case Lexer.QUANT_PLUS: { + QuantifierSet q; + lexemes.next(); - GroupQuantifierSet q = new GroupQuantifierSet(term, last, quant); + if (term.getType() == AbstractSet.TYPE_DOTSET) { + if (!hasFlag(Pattern.DOTALL)) { + q = new DotQuantifierSet(term, last, quant, + AbstractLineTerminator.getInstance(flags)); + } else { + q = new DotAllQuantifierSet(term, last, quant); + } + } else { + q = new GroupQuantifierSet(term, last, quant); + } term.setNext(q); return q; } @@ -725,17 +730,8 @@ case Lexer.QUANT_STAR: case 
Lexer.QUANT_PLUS: { lexemes.next(); - LeafQuantifierSet q; - if (term.getType() == AbstractSet.TYPE_DOTSET) { - if (!hasFlag(Pattern.DOTALL)) { - q = new DotQuantifierSet(leaf, last, quant, - AbstractLineTerminator.getInstance(flags)); - } else { - q = new DotAllQuantifierSet(leaf, last, quant); - } - } else { - q = new LeafQuantifierSet(leaf, last, quant); - } + LeafQuantifierSet q = new LeafQuantifierSet(leaf, + last, quant); leaf.setNext(q); return q; } @@ -958,8 +954,10 @@ case 0: { AbstractCharClass cc = null; if ((cc = (AbstractCharClass) lexemes.peekSpecial()) != null) { - term = new RangeSet(cc); + term = processRangeSet(cc); } else if (!lexemes.isEmpty()) { + + //ch == 0 term = new CharSet((char) ch); } else { term = new EmptySet(last); @@ -971,19 +969,7 @@ default: { if (ch >= 0 && !lexemes.isSpecial()) { - if (hasFlag(Pattern.CASE_INSENSITIVE)) { - if ((ch >= 'a' && ch <= 'z') - || (ch >= 'A' && ch <= 'Z')) { - term = new CICharSet((char) ch); - } else if (hasFlag(Pattern.UNICODE_CASE) - && ch > 128) { - term = new UCICharSet((char) ch); - } else { - term = new CharSet((char) ch); - } - } else { - term = new CharSet((char) ch); - } + term = processCharSet(ch); lexemes.next(); } else if (ch == Lexer.CHAR_VERTICAL_BAR) { term = new EmptySet(last); @@ -1011,17 +997,16 @@ private AbstractSet processRange(boolean negative, AbstractSet last) { AbstractCharClass res = processRangeExpression(negative); - if (!res.hasUCI()) { - return new RangeSet(res, last); - } else { - return new UCIRangeSet(res, last); - } + AbstractSet rangeSet = processRangeSet(res); + rangeSet.setNext(last); + + return rangeSet; } /** * proceess [...] 
ranges */ - private AbstractCharClass processRangeExpression(boolean alt) { + private CharClass processRangeExpression(boolean alt) { CharClass res = new CharClass(alt, hasFlag(Pattern.CASE_INSENSITIVE), hasFlag(Pattern.UNICODE_CASE)); int buffer = -1; @@ -1042,6 +1027,10 @@ break; } case Lexer.CHAR_LEFT_SQUARE_BRACKET: { + if (buffer >= 0) { + res.add(buffer); + buffer = -1; + } lexemes.next(); boolean negative = false; if (lexemes.peek() == Lexer.CHAR_CARET) { @@ -1062,13 +1051,37 @@ if (buffer >= 0) res.add(buffer); buffer = lexemes.next(); - // if there is a start for subrange we will do an intersection - // otherwise treat '&' as normal character - if (lexemes.peek() == Lexer.CHAR_AMPERSAND - && lexemes.lookAhead() == Lexer.CHAR_LEFT_SQUARE_BRACKET) { - lexemes.next(); - intersection = true; - buffer = -1; + + /* + * if there is a start for subrange we will do an intersection + * otherwise treat '&' as a normal character + */ + if (lexemes.peek() == Lexer.CHAR_AMPERSAND) { + if (lexemes.lookAhead() + == Lexer.CHAR_LEFT_SQUARE_BRACKET) { + lexemes.next(); + intersection = true; + buffer = -1; + } else { + lexemes.next(); + if (firstInClass) { + + //skip "&&" at "[&&...]" or "[^&&...]" + res = processRangeExpression(false); + } else { + + //ignore "&&" at "[X&&]" ending where X != empty string + if (!(lexemes.peek() + == Lexer.CHAR_RIGHT_SQUARE_BRACKET)) { + res.intersection(processRangeExpression(false)); + } + } + + } + } else { + + //treat '&' as a normal character + buffer = '&'; } break; @@ -1095,7 +1108,10 @@ || lexemes.lookAhead() == Lexer.CHAR_LEFT_SQUARE_BRACKET || buffer < 0)) { try { - res.add(buffer, (char) lexemes.peek()); + if (!Lexer.isLetter(cur)) { + cur = cur & 0xFFFF; + } + res.add(buffer, cur); } catch (Exception e) { throw new PatternSyntaxException( Messages.getString("regex.0E"), //$NON-NLS-1$ @@ -1113,6 +1129,14 @@ break; } + case Lexer.CHAR_CARET: { + if (buffer >= 0) + res.add(buffer); + buffer = '^'; + lexemes.next(); + break; + } + 
case 0: { if (buffer >= 0) res.add(buffer); @@ -1148,6 +1172,88 @@ return res; } + private AbstractSet processCharSet(int ch) { + boolean isSupplCodePoint = Character + .isSupplementaryCodePoint(ch); + + if (hasFlag(Pattern.CASE_INSENSITIVE)) { + + if ((ch >= 'a' && ch <= 'z') + || (ch >= 'A' && ch <= 'Z')) { + return new CICharSet((char) ch); + } else if (hasFlag(Pattern.UNICODE_CASE) + && ch > 128) { + if (isSupplCodePoint) { + return new UCISupplCharSet(ch); + } else if (Lexer.isLowSurrogate(ch)) { + + //we need no UCILowSurrogateCharSet + return new LowSurrogateCharSet((char) ch); + } else if (Lexer.isHighSurrogate(ch)) { + + //we need no UCIHighSurrogateCharSet + return new HighSurrogateCharSet((char) ch); + } else { + return new UCICharSet((char) ch); + } + } + } + + if (isSupplCodePoint) { + return new SupplCharSet(ch); + } else if (Lexer.isLowSurrogate(ch)) { + return new LowSurrogateCharSet((char) ch); + } else if (Lexer.isHighSurrogate(ch)) { + return new HighSurrogateCharSet((char) ch); + } else { + return new CharSet((char) ch); + } + } + + private AbstractSet processRangeSet(AbstractCharClass charClass) { + if (charClass.hasLowHighSurrogates()) { + AbstractCharClass surrogates = charClass.getSurrogates(); + LowHighSurrogateRangeSet lowHighSurrRangeSet + = new LowHighSurrogateRangeSet(surrogates); + + if (charClass.mayContainSupplCodepoints()) { + if (!charClass.hasUCI()) { + return new CompositeRangeSet( + new SupplRangeSet(charClass.getWithoutSurrogates()), + lowHighSurrRangeSet); + } else { + return new CompositeRangeSet( + new UCISupplRangeSet(charClass.getWithoutSurrogates()), + lowHighSurrRangeSet); + } + } + + if (!charClass.hasUCI()) { + return new CompositeRangeSet( + new RangeSet(charClass.getWithoutSurrogates()), + lowHighSurrRangeSet); + } else { + return new CompositeRangeSet( + new UCIRangeSet(charClass.getWithoutSurrogates()), + lowHighSurrRangeSet); + } + } + + if (charClass.mayContainSupplCodepoints()) { + if (!charClass.hasUCI()) { + 
return new SupplRangeSet(charClass); + } else { + return new UCISupplRangeSet(charClass); + } + } + + if (!charClass.hasUCI()) { + return new RangeSet(charClass); + } else { + return new UCIRangeSet(charClass); + } + } + /** * @com.intel.drl.spec_ref */ Index: modules/regex/src/main/java/java/util/regex/SupplCharSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/SupplCharSet.java (revision 0) +++ modules/regex/src/main/java/java/util/regex/SupplCharSet.java (revision 0) @@ -0,0 +1,197 @@ +/* + * Copyright 2006 The Apache Software Foundation or its licensors, as applicable. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * + * Portions, Copyright © 1991-2005 Unicode, Inc. The following applies to Unicode. + * + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright © 1991-2005 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. 
Permission is + * hereby granted, free of charge, to any person obtaining a copy of the + * Unicode data files and any associated documentation (the "Data Files") + * or Unicode software and any associated documentation (the "Software") + * to deal in the Data Files or Software without restriction, including without + * limitation the rights to use, copy, modify, merge, publish, distribute, + * and/or sell copies of the Data Files or Software, and to permit persons + * to whom the Data Files or Software are furnished to do so, provided that + * (a) the above copyright notice(s) and this permission notice appear with + * all copies of the Data Files or Software, (b) both the above copyright + * notice(s) and this permission notice appear in associated documentation, + * and (c) there is clear notice in each modified Data File or in the Software + * as well as in the documentation associated with the Data File(s) or Software + * that the data or software has been modified. + + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT + * OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + * + * 2. 
Additional terms from the Database: + * + * Copyright © 1995-1999 Unicode, Inc. All Rights reserved. + * + * Disclaimer + * + * The Unicode Character Database is provided as is by Unicode, Inc. + * No claims are made as to fitness for any particular purpose. No warranties + * of any kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been purchased + * on magnetic or optical media from Unicode, Inc., the sole remedy for any claim + * will be exchange of defective media within 90 days of receipt. This disclaimer + * is applicable for all other data files accompanying the Unicode Character Database, + * some of which have been compiled by the Unicode Consortium, and some of which + * have been supplied by other sources. + * + * Limitations on Rights to Redistribute This Data + * + * Recipient is granted the right to make copies in any form for internal + * distribution and to freely use the information supplied in the creation of + * products supporting the UnicodeTM Standard. The files in + * the Unicode Character Database can be redistributed to third parties or other + * organizations (whether for profit or not) as long as this notice and the disclaimer + * notice are retained. Information can be extracted from these files and used + * in documentation or programs, as long as there is an accompanying notice + * indicating the source. + */ + +package java.util.regex; + +/** + * Represents node accepting single supplementary codepoint. 
+ */
+class SupplCharSet extends LeafSet {
+
+    /*
+     * UTF-16 encoding of this supplementary codepoint
+     */
+    private char high = 0;
+
+    private char low = 0;
+
+    //int value of this supplementary codepoint
+    private int ch;
+
+    public SupplCharSet(int ch) {
+        charCount = 2;
+        this.ch = ch;
+        char [] chUTF16 = Character.toChars(ch);
+        high = chUTF16[0];
+
+        /*
+         * we suppose that SupplCharSet is
+         * built over supplementary codepoints only
+         */
+        low = chUTF16[1];
+    }
+
+    public int accepts(int strIndex, CharSequence testString) {
+        char high = testString.charAt(strIndex++);
+        char low = testString.charAt(strIndex);
+        return ((this.high == high) && (this.low == low)) ? 2 : -1;
+    }
+
+    public int find(int strIndex, CharSequence testString,
+            MatchResultImpl matchResult) {
+        // String input: locate the high surrogate with indexOf, then check low.
+        if (testString instanceof String) {
+            String testStr = (String) testString;
+            int strLength = matchResult.getRightBound();
+
+            while (strIndex < strLength) {
+                strIndex = testStr.indexOf(high, strIndex);
+                if (strIndex < 0)
+                    return -1;
+
+                strIndex++;
+                if (strIndex < strLength) {
+                    char ch = testStr.charAt(strIndex);
+
+                    if ((low == ch)
+                            && (next.matches(strIndex + 1,
+                                    testString, matchResult) >= 0)) {
+                        return --strIndex;
+                    }
+                    // Resume at this char, not after it: it may itself be 'high'.
+                }
+            }
+            return -1;
+        }
+
+        return super.find(strIndex, testString, matchResult);
+    }
+
+    public int findBack(int strIndex, int lastIndex, CharSequence testString,
+            MatchResultImpl matchResult) {
+        // String input: locate the low surrogate backwards, then check high.
+        if (testString instanceof String) {
+            String testStr = (String) testString;
+
+            while (lastIndex >= strIndex) {
+                lastIndex = testStr.lastIndexOf(low, lastIndex);
+                lastIndex--;
+                if (lastIndex < 0 || lastIndex < strIndex) {
+                    return -1;
+                }
+
+                if ((high == testStr.charAt(lastIndex))
+                        && next.matches(lastIndex + 2,
+                                testString, matchResult) >= 0) {
+                    return lastIndex;
+                }
+                // lastIndex already sits one below the low surrogate just
+                // examined, so the next search must not skip that position.
+            }
+            return -1;
+        }
+
+        return super.findBack(strIndex, lastIndex, testString, matchResult);
+    }
+
+    protected String getName() {
+        return "" + high + low;
+    }
+
+    
protected int getCodePoint() { + return ch; + } + + public boolean first(AbstractSet set) { + if (set instanceof SupplCharSet) { + return ((SupplCharSet) set).getCodePoint() == ch; + } else if (set instanceof SupplRangeSet) { + return ((SupplRangeSet) set) + .contains(ch); + } else if (set instanceof CharSet) { + return false; + } else if (set instanceof RangeSet) { + return false; + } + + return true; + } +} \ No newline at end of file Index: modules/regex/src/main/java/java/util/regex/DotAllQuantifierSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/DotAllQuantifierSet.java (revision 450389) +++ modules/regex/src/main/java/java/util/regex/DotAllQuantifierSet.java (working copy) @@ -27,9 +27,9 @@ * @author Nikolay A. Kuznetsov * @version $Revision: 1.8.2.2 $ */ -class DotAllQuantifierSet extends LeafQuantifierSet { +class DotAllQuantifierSet extends QuantifierSet { - public DotAllQuantifierSet(LeafSet innerSet, AbstractSet next, int type) { + public DotAllQuantifierSet(AbstractSet innerSet, AbstractSet next, int type) { super(innerSet, next, type); } @@ -53,4 +53,8 @@ return -1; } } + + protected String getName() { + return ""; + } } Index: modules/regex/src/main/java/java/util/regex/PosPlusGroupQuantifierSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/PosPlusGroupQuantifierSet.java (revision 450389) +++ modules/regex/src/main/java/java/util/regex/PosPlusGroupQuantifierSet.java (working copy) @@ -31,7 +31,7 @@ public PosPlusGroupQuantifierSet(AbstractSet innerSet, AbstractSet next, int type) { super(innerSet, next, type); - ((JointSet) innerSet).fSet.setNext(FSet.posFSet); + ((JointSet) innerSet).setNext(FSet.posFSet); } Index: modules/regex/src/main/java/java/util/regex/CharSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/CharSet.java 
(revision 450389) +++ modules/regex/src/main/java/java/util/regex/CharSet.java (working copy) @@ -44,41 +44,48 @@ public int find(int strIndex, CharSequence testString, MatchResultImpl matchResult) { - boolean res = false; - String testStr = testString.toString(); - int strLength = matchResult.getRightBound(); + if (testString instanceof String) { + String testStr = (String) testString; + int strLength = matchResult.getRightBound(); - while (strIndex < strLength) { - strIndex = testStr.indexOf(ch, strIndex); - if (strIndex < 0) - return -1; - if (next.matches(strIndex + 1, testString, matchResult) >= 0) { - return strIndex; + while (strIndex < strLength) { + strIndex = testStr.indexOf(ch, strIndex); + if (strIndex < 0) + return -1; + if (next.matches(strIndex + 1, testString, matchResult) >= 0) { + return strIndex; + } + strIndex++; } - strIndex++; + + return -1; } - - return -1; + + return super.find(strIndex, testString, matchResult); } public int findBack(int strIndex, int lastIndex, CharSequence testString, MatchResultImpl matchResult) { - String testStr = testString.toString(); + if (testString instanceof String) { + String testStr = (String) testString; - while (lastIndex >= strIndex) { - lastIndex = testStr.lastIndexOf(ch, lastIndex); - if (lastIndex < 0 || lastIndex < strIndex) { - return -1; - } + while (lastIndex >= strIndex) { + lastIndex = testStr.lastIndexOf(ch, lastIndex); + if (lastIndex < 0 || lastIndex < strIndex) { + return -1; + } - if (next.matches(lastIndex + 1, testString, matchResult) >= 0) { - return lastIndex; + if (next.matches(lastIndex + 1, testString, matchResult) >= 0) { + return lastIndex; + } + + lastIndex--; } - lastIndex--; + return -1; } - - return -1; + + return super.findBack(strIndex, lastIndex, testString, matchResult); } protected String getName() { @@ -94,6 +101,10 @@ return ((CharSet) set).getChar() == ch; } else if (set instanceof RangeSet) { return ((RangeSet) set).accepts(0, Character.toString(ch)) > 0; + } else if (set 
instanceof SupplRangeSet) { + return ((SupplRangeSet) set).contains(ch); + } else if (set instanceof SupplCharSet) { + return false; } return true; Index: modules/regex/src/main/java/java/util/regex/UCICharSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/UCICharSet.java (revision 450389) +++ modules/regex/src/main/java/java/util/regex/UCICharSet.java (working copy) @@ -43,8 +43,4 @@ protected String getName() { return "UCI " + ch; //$NON-NLS-1$ } - - protected char getChar() { - return ch; - } } \ No newline at end of file Index: modules/regex/src/main/java/java/util/regex/DecomposedCharSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/DecomposedCharSet.java (revision 450389) +++ modules/regex/src/main/java/java/util/regex/DecomposedCharSet.java (working copy) @@ -197,7 +197,7 @@ StringBuffer strBuff = new StringBuffer(); for (int i = 0; i < decomposedCharLength; i++) { - strBuff.append(Lexer.toChars(decomposedChar[i])); + strBuff.append(Character.toChars(decomposedChar[i])); } decomposedCharUTF16 = strBuff.toString(); } @@ -230,9 +230,9 @@ char high = testString.charAt(strIndex++); char low = testString.charAt(strIndex); - if (Lexer.isSurrogatePair(high, low)) { + if (Character.isSurrogatePair(high, low)) { char [] curCodePointUTF16 = new char [] {high, low}; - curChar = Lexer.codePointAt(curCodePointUTF16, 0); + curChar = Character.codePointAt(curCodePointUTF16, 0); readCharsForCodePoint = 2; } else { curChar = high; Index: modules/regex/src/main/java/java/util/regex/LowSurrogateCharSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/LowSurrogateCharSet.java (revision 0) +++ modules/regex/src/main/java/java/util/regex/LowSurrogateCharSet.java (revision 0) @@ -0,0 +1,249 @@ +/* + * Copyright 2006 The Apache Software Foundation or its licensors, 
as applicable. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * + * Portions, Copyright © 1991-2005 Unicode, Inc. The following applies to Unicode. + * + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright © 1991-2005 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. Permission is + * hereby granted, free of charge, to any person obtaining a copy of the + * Unicode data files and any associated documentation (the "Data Files") + * or Unicode software and any associated documentation (the "Software") + * to deal in the Data Files or Software without restriction, including without + * limitation the rights to use, copy, modify, merge, publish, distribute, + * and/or sell copies of the Data Files or Software, and to permit persons + * to whom the Data Files or Software are furnished to do so, provided that + * (a) the above copyright notice(s) and this permission notice appear with + * all copies of the Data Files or Software, (b) both the above copyright + * notice(s) and this permission notice appear in associated documentation, + * and (c) there is clear notice in each modified Data File or in the Software + * as well as in the documentation associated with the Data File(s) or Software + * that the data or software has been modified. 
+ + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT + * OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + * + * 2. Additional terms from the Database: + * + * Copyright © 1995-1999 Unicode, Inc. All Rights reserved. + * + * Disclaimer + * + * The Unicode Character Database is provided as is by Unicode, Inc. + * No claims are made as to fitness for any particular purpose. No warranties + * of any kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been purchased + * on magnetic or optical media from Unicode, Inc., the sole remedy for any claim + * will be exchange of defective media within 90 days of receipt. This disclaimer + * is applicable for all other data files accompanying the Unicode Character Database, + * some of which have been compiled by the Unicode Consortium, and some of which + * have been supplied by other sources. 
+ * + * Limitations on Rights to Redistribute This Data + * + * Recipient is granted the right to make copies in any form for internal + * distribution and to freely use the information supplied in the creation of + * products supporting the UnicodeTM Standard. The files in + * the Unicode Character Database can be redistributed to third parties or other + * organizations (whether for profit or not) as long as this notice and the disclaimer + * notice are retained. Information can be extracted from these files and used + * in documentation or programs, as long as there is an accompanying notice + * indicating the source. + */ + +package java.util.regex; + +/** + * This class represents low surrogate character. + */ +class LowSurrogateCharSet extends JointSet{ + + /* + * Note that we can use high and low surrogate characters + * that don't combine into supplementary code point. + * See http://www.unicode.org/reports/tr18/#Supplementary_Characters + */ + private char low; + + public LowSurrogateCharSet(char low) { + this.low = low; + } + + /** + * Returns the next. + */ + public AbstractSet getNext() { + return this.next; + } + + /** + * Sets next abstract set. + * @param next + * The next to set. 
+ */ + public void setNext(AbstractSet next) { + this.next = next; + } + + public int matches(int stringIndex, CharSequence testString, + MatchResultImpl matchResult) { + + if (stringIndex + 1 > matchResult.getRightBound()) { + matchResult.hitEnd = true; + return -1; + } + + char low = testString.charAt(stringIndex); + + if (stringIndex > matchResult.getLeftBound()) { + char high = testString.charAt(stringIndex - 1); + + /* + * we consider high surrogate followed by + * low surrogate as a codepoint + */ + if (Character.isHighSurrogate(high)) { + return -1; + } + } + + if (this.low == low) { + return next.matches(stringIndex + 1, testString, + matchResult); + } + + return -1; + } + + public int find(int strIndex, CharSequence testString, + MatchResultImpl matchResult) { + if (testString instanceof String) { + String testStr = (String) testString; + int startStr = matchResult.getLeftBound(); + int strLength = matchResult.getRightBound(); + + while (strIndex < strLength) { + + strIndex = testStr.indexOf(low, strIndex); + if (strIndex < 0) + return -1; + + if (strIndex > startStr) { + + /* + * we consider high surrogate followed by + * low surrogate as a codepoint + */ + if (Character.isHighSurrogate(testStr.charAt(strIndex - 1))) { + strIndex++; + continue; + } + } + + if (next.matches(strIndex + 1, testString, matchResult) >= 0) { + return strIndex; + } + strIndex++; + } + + return -1; + } + + return super.find(strIndex, testString, matchResult); + } + + public int findBack(int strIndex, int lastIndex, CharSequence testString, + MatchResultImpl matchResult) { + if (testString instanceof String) { + int startStr = matchResult.getLeftBound(); + String testStr = (String) testString; + + while (lastIndex >= strIndex) { + lastIndex = testStr.lastIndexOf(low, lastIndex); + if (lastIndex < 0 || lastIndex < strIndex) { + return -1; + } + + if (lastIndex > startStr) { + + /* + * we consider high surrogate followed by + * low surrogate as a codepoint + */ + if 
(Character.isHighSurrogate(testStr.charAt(lastIndex - 1))) { + lastIndex -= 2; + continue; + } + } + + if (next.matches(lastIndex + 1, testString, matchResult) >= 0) { + return lastIndex; + } + + lastIndex--; + } + + return -1; + } + + return super.findBack(strIndex, lastIndex, testString, matchResult); + } + + protected String getName() { + return "" + low; + } + + protected int getChar() { + return low; + } + + public boolean first(AbstractSet set) { + if (set instanceof CharSet) { + return false; + } else if (set instanceof RangeSet) { + return false; + } else if (set instanceof SupplRangeSet) { + return false; + } else if (set instanceof SupplCharSet) { + return false; + } else if (set instanceof HighSurrogateCharSet) { + return false; + } else if (set instanceof LowSurrogateCharSet) { + return ((LowSurrogateCharSet) set).low == this.low; + } + + return true; + } + + public boolean hasConsumed(MatchResultImpl matchResult) { + return true; + } +} \ No newline at end of file Index: modules/regex/src/main/java/java/util/regex/UCIRangeSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/UCIRangeSet.java (revision 450389) +++ modules/regex/src/main/java/java/util/regex/UCIRangeSet.java (working copy) @@ -40,6 +40,11 @@ this.alt = cs.alt; } + public UCIRangeSet(AbstractCharClass cc) { + this.chars = cc.getInstance(); + this.alt = cc.alt; + } + public int accepts(int strIndex, CharSequence testString) { return (chars.contains(Character.toLowerCase(Character .toUpperCase(testString.charAt(strIndex))))) ? 
1 : -1; Index: modules/regex/src/main/java/java/util/regex/UCISupplCharSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/UCISupplCharSet.java (revision 0) +++ modules/regex/src/main/java/java/util/regex/UCISupplCharSet.java (revision 0) @@ -0,0 +1,109 @@ +/* + * Copyright 2006 The Apache Software Foundation or its licensors, as applicable. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * + * Portions, Copyright © 1991-2005 Unicode, Inc. The following applies to Unicode. + * + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright © 1991-2005 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. 
Permission is + * hereby granted, free of charge, to any person obtaining a copy of the + * Unicode data files and any associated documentation (the "Data Files") + * or Unicode software and any associated documentation (the "Software") + * to deal in the Data Files or Software without restriction, including without + * limitation the rights to use, copy, modify, merge, publish, distribute, + * and/or sell copies of the Data Files or Software, and to permit persons + * to whom the Data Files or Software are furnished to do so, provided that + * (a) the above copyright notice(s) and this permission notice appear with + * all copies of the Data Files or Software, (b) both the above copyright + * notice(s) and this permission notice appear in associated documentation, + * and (c) there is clear notice in each modified Data File or in the Software + * as well as in the documentation associated with the Data File(s) or Software + * that the data or software has been modified. + + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT + * OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + * + * 2. 
Additional terms from the Database: + * + * Copyright © 1995-1999 Unicode, Inc. All Rights reserved. + * + * Disclaimer + * + * The Unicode Character Database is provided as is by Unicode, Inc. + * No claims are made as to fitness for any particular purpose. No warranties + * of any kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been purchased + * on magnetic or optical media from Unicode, Inc., the sole remedy for any claim + * will be exchange of defective media within 90 days of receipt. This disclaimer + * is applicable for all other data files accompanying the Unicode Character Database, + * some of which have been compiled by the Unicode Consortium, and some of which + * have been supplied by other sources. + * + * Limitations on Rights to Redistribute This Data + * + * Recipient is granted the right to make copies in any form for internal + * distribution and to freely use the information supplied in the creation of + * products supporting the UnicodeTM Standard. The files in + * the Unicode Character Database can be redistributed to third parties or other + * organizations (whether for profit or not) as long as this notice and the disclaimer + * notice are retained. Information can be extracted from these files and used + * in documentation or programs, as long as there is an accompanying notice + * indicating the source. + */ + +package java.util.regex; + +/** + * Represents node accepting single supplementary + * codepoint in Unicode case insensitive manner. 
+ */
+class UCISupplCharSet extends LeafSet {
+    //int value of this supplementary codepoint (stored case-folded)
+    private int ch;
+
+    public UCISupplCharSet(int ch) {
+        charCount = 2;
+        this.ch = Character.toLowerCase(Character.toUpperCase(ch));
+    }
+
+    public int accepts(int strIndex, CharSequence testString) {
+        char high = testString.charAt(strIndex++);
+        char low = testString.charAt(strIndex);
+        // toCodePoint() does not validate its input: check the pair first
+        return (Character.isSurrogatePair(high, low)
+                && this.ch == Character.toLowerCase(Character
+                        .toUpperCase(Character.toCodePoint(high, low)))) ? 2 : -1;
+    }
+
+    protected String getName() {
+        return "UCI " + new String(Character.toChars(ch));
+    }
+}
\ No newline at end of file
Index: modules/regex/src/main/java/java/util/regex/AltQuantifierSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/AltQuantifierSet.java (revision 450389) +++ modules/regex/src/main/java/java/util/regex/AltQuantifierSet.java (working copy) @@ -34,7 +34,6 @@ public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { - int i = 0; int shift = 0; if ((shift = innerSet.matches(stringIndex, testString, matchResult)) >= 0) { Index: modules/regex/src/main/java/java/util/regex/AbstractCharClass.java =================================================================== --- modules/regex/src/main/java/java/util/regex/AbstractCharClass.java (revision 450389) +++ modules/regex/src/main/java/java/util/regex/AbstractCharClass.java (working copy) @@ -36,8 +36,26 @@ abstract class AbstractCharClass extends SpecialToken { protected boolean alt; + protected boolean altSurrogates; + + //Character.MAX_SURROGATE - Character.MIN_SURROGATE + 1 + static int SURROGATE_CARDINALITY = 2048; + + BitSet lowHighSurrogates = new BitSet(SURROGATE_CARDINALITY); + + AbstractCharClass charClassWithoutSurrogates = null; + + AbstractCharClass charClassWithSurrogates = null; + static PredefinedCharacterClasses charClasses = new PredefinedCharacterClasses(); + /* + * Indicates if this
class may contain supplementary Unicode codepoints. + * If this flag is specified it doesn't mean that this class contains + * supplementary characters but may contain. + */ + protected boolean mayContainSupplCodepoints = false; + /** * Returns true if this char class contains character specified; * @@ -55,7 +73,21 @@ protected BitSet getBits() { return null; } + + protected BitSet getLowHighSurrogates() { + return lowHighSurrogates; + }
+    public boolean hasLowHighSurrogates() {
+        // nextSetBit() returns -1 (which is < CARDINALITY) when no bit is set.
+        return altSurrogates ? lowHighSurrogates.nextClearBit(0) < SURROGATE_CARDINALITY
+                             : lowHighSurrogates.nextSetBit(0) >= 0;
+    }
+ + public boolean mayContainSupplCodepoints() { + return mayContainSupplCodepoints; + } + public int getType() { return SpecialToken.TOK_CHARCLASS; } @@ -63,7 +95,55 @@ public AbstractCharClass getInstance() { return this; } + + public AbstractCharClass getSurrogates() { + + if (charClassWithSurrogates == null) { + final BitSet lHS = getLowHighSurrogates(); + charClassWithSurrogates = new AbstractCharClass() { + public boolean contains(int ch) { + int index = ch - Character.MIN_SURROGATE; + + return ((index >= 0) + && (index < AbstractCharClass.SURROGATE_CARDINALITY)) + ? this.altSurrogates ^ lHS.get(index) + : false; + } + }; + charClassWithSurrogates.setNegative(this.altSurrogates); + } + + return charClassWithSurrogates; + } + + public AbstractCharClass getWithoutSurrogates() { + if (charClassWithoutSurrogates == null) { + final BitSet lHS = getLowHighSurrogates(); + final AbstractCharClass thisClass = this; + + charClassWithoutSurrogates = new AbstractCharClass() { + public boolean contains(int ch) { + int index = ch - Character.MIN_SURROGATE; + + boolean containslHS = ((index >= 0) + && (index < AbstractCharClass.SURROGATE_CARDINALITY)) + ? 
this.altSurrogates ^ lHS.get(index) + : false; + + + return thisClass.contains(ch) + && !containslHS; + } + }; + charClassWithoutSurrogates.setNegative(isNegative()); + charClassWithoutSurrogates.mayContainSupplCodepoints + = mayContainSupplCodepoints; + } + + return charClassWithoutSurrogates; + } + public boolean hasUCI() { return false; } @@ -81,8 +161,13 @@ * @see #union(CharClass) */ public AbstractCharClass setNegative(boolean value) { - if (alt ^ value) + if (alt ^ value) { alt = !alt; + altSurrogates = !altSurrogates; + } + if (!mayContainSupplCodepoints) { + mayContainSupplCodepoints = true; + } return this; } @@ -94,11 +179,11 @@ // Static methods and predefined classes // ----------------------------------------------------------------- - public static boolean intersects(char ch1, char ch2) { + public static boolean intersects(int ch1, int ch2) { return ch1 == ch2; } - public static boolean intersects(AbstractCharClass cc, char ch) { + public static boolean intersects(AbstractCharClass cc, int ch) { return cc.contains(ch); } @@ -141,7 +226,10 @@ static class LazyNonDigit extends LazyDigit { protected AbstractCharClass computeValue() { - return super.computeValue().setNegative(true); + AbstractCharClass chCl = super.computeValue().setNegative(true); + + chCl.mayContainSupplCodepoints = true; + return chCl; } } @@ -154,7 +242,10 @@ static class LazyNonSpace extends LazySpace { protected AbstractCharClass computeValue() { - return super.computeValue().setNegative(true); + AbstractCharClass chCl = super.computeValue().setNegative(true); + + chCl.mayContainSupplCodepoints = true; + return chCl; } } @@ -167,7 +258,10 @@ static class LazyNonWord extends LazyWord { protected AbstractCharClass computeValue() { - return super.computeValue().setNegative(true); + AbstractCharClass chCl = super.computeValue().setNegative(true); + + chCl.mayContainSupplCodepoints = true; + return chCl; } } @@ -250,7 +344,8 @@ } public AbstractCharClass computeValue() { - return new 
CharClass().add(start, end); + AbstractCharClass chCl = new CharClass().add(start, end); + return chCl; } } @@ -262,45 +357,85 @@ static class LazyCategoryScope extends LazyCharClass { int category; + + boolean mayContainSupplCodepoints; - public LazyCategoryScope(int cat) { + boolean containsAllSurrogates; + + public LazyCategoryScope(int cat, boolean mayContainSupplCodepoints) { + this.mayContainSupplCodepoints = mayContainSupplCodepoints; this.category = cat; } + public LazyCategoryScope(int cat, boolean mayContainSupplCodepoints, + boolean containsAllSurrogates) { + this.containsAllSurrogates = containsAllSurrogates; + this.mayContainSupplCodepoints = mayContainSupplCodepoints; + this.category = cat; + } + protected AbstractCharClass computeValue() { - return new UnicodeCategoryScope(category); + AbstractCharClass chCl = new UnicodeCategoryScope(category); + if (containsAllSurrogates) { + chCl.lowHighSurrogates.set(0, SURROGATE_CARDINALITY); + } + + chCl.mayContainSupplCodepoints = mayContainSupplCodepoints;; + return chCl; } } static class LazyCategory extends LazyCharClass { int category; - public LazyCategory(int cat) { + boolean mayContainSupplCodepoints; + + boolean containsAllSurrogates; + + public LazyCategory(int cat, boolean mayContainSupplCodepoints) { + this.mayContainSupplCodepoints = mayContainSupplCodepoints; this.category = cat; } - + public LazyCategory(int cat, boolean mayContainSupplCodepoints, + boolean containsAllSurrogates) { + this.containsAllSurrogates = containsAllSurrogates; + this.mayContainSupplCodepoints = mayContainSupplCodepoints; + this.category = cat; + } + protected AbstractCharClass computeValue() { - return new UnicodeCategory(category); + AbstractCharClass chCl = new UnicodeCategory(category); + if (containsAllSurrogates) { + chCl.lowHighSurrogates.set(0, SURROGATE_CARDINALITY); + } + chCl.mayContainSupplCodepoints = mayContainSupplCodepoints;; + return chCl; } } static class LazyJavaLowerCase extends LazyCharClass { 
protected AbstractCharClass computeValue() { - return new AbstractCharClass() { + AbstractCharClass chCl = new AbstractCharClass() { public boolean contains(int ch) { - return Character.isLowerCase((char) ch); + return Character.isLowerCase(ch); } }; + + chCl.mayContainSupplCodepoints = true; + return chCl; } } static class LazyJavaUpperCase extends LazyCharClass { protected AbstractCharClass computeValue() { - return new AbstractCharClass() { + AbstractCharClass chCl = new AbstractCharClass() { public boolean contains(int ch) { - return Character.isUpperCase((char) ch); + return Character.isUpperCase(ch); } }; + + chCl.mayContainSupplCodepoints = true; + return chCl; } } @@ -308,7 +443,7 @@ protected AbstractCharClass computeValue() { return new AbstractCharClass() { public boolean contains(int ch) { - return Character.isWhitespace((char) ch); + return Character.isWhitespace(ch); } }; } @@ -318,7 +453,7 @@ protected AbstractCharClass computeValue() { return new AbstractCharClass() { public boolean contains(int ch) { - return Character.isMirrored((char) ch); + return Character.isMirrored(ch); } }; } @@ -326,31 +461,41 @@ static class LazyJavaDefined extends LazyCharClass { protected AbstractCharClass computeValue() { - return new AbstractCharClass() { + AbstractCharClass chCl = new AbstractCharClass() { public boolean contains(int ch) { - return Character.isDefined((char) ch); + return Character.isDefined(ch); } }; + chCl.lowHighSurrogates.set(0, SURROGATE_CARDINALITY); + + chCl.mayContainSupplCodepoints = true; + return chCl; } } static class LazyJavaDigit extends LazyCharClass { protected AbstractCharClass computeValue() { - return new AbstractCharClass() { + AbstractCharClass chCl = new AbstractCharClass() { public boolean contains(int ch) { - return Character.isDigit((char) ch); + return Character.isDigit(ch); } }; + + chCl.mayContainSupplCodepoints = true; + return chCl; } } static class LazyJavaIdentifierIgnorable extends LazyCharClass { protected 
AbstractCharClass computeValue() { - return new AbstractCharClass() { + AbstractCharClass chCl = new AbstractCharClass() { public boolean contains(int ch) { - return Character.isIdentifierIgnorable((char) ch); + return Character.isIdentifierIgnorable(ch); } }; + + chCl.mayContainSupplCodepoints = true; + return chCl; } } @@ -358,7 +503,7 @@ protected AbstractCharClass computeValue() { return new AbstractCharClass() { public boolean contains(int ch) { - return Character.isISOControl((char) ch); + return Character.isISOControl(ch); } }; } @@ -366,41 +511,53 @@ static class LazyJavaJavaIdentifierPart extends LazyCharClass { protected AbstractCharClass computeValue() { - return new AbstractCharClass() { + AbstractCharClass chCl = new AbstractCharClass() { public boolean contains(int ch) { - return Character.isJavaIdentifierPart((char) ch); + return Character.isJavaIdentifierPart(ch); } }; + + chCl.mayContainSupplCodepoints = true; + return chCl; } } static class LazyJavaJavaIdentifierStart extends LazyCharClass { protected AbstractCharClass computeValue() { - return new AbstractCharClass() { + AbstractCharClass chCl = new AbstractCharClass() { public boolean contains(int ch) { - return Character.isJavaIdentifierStart((char) ch); + return Character.isJavaIdentifierStart(ch); } }; + + chCl.mayContainSupplCodepoints = true; + return chCl; } } static class LazyJavaLetter extends LazyCharClass { protected AbstractCharClass computeValue() { - return new AbstractCharClass() { + AbstractCharClass chCl = new AbstractCharClass() { public boolean contains(int ch) { - return Character.isLetter((char) ch); + return Character.isLetter(ch); } }; + + chCl.mayContainSupplCodepoints = true; + return chCl; } } static class LazyJavaLetterOrDigit extends LazyCharClass { protected AbstractCharClass computeValue() { - return new AbstractCharClass() { + AbstractCharClass chCl = new AbstractCharClass() { public boolean contains(int ch) { - return Character.isLetterOrDigit((char) ch); + return 
Character.isLetterOrDigit(ch); } }; + + chCl.mayContainSupplCodepoints = true; + return chCl; } } @@ -408,7 +565,7 @@ protected AbstractCharClass computeValue() { return new AbstractCharClass() { public boolean contains(int ch) { - return Character.isSpaceChar((char) ch); + return Character.isSpaceChar(ch); } }; } @@ -418,7 +575,7 @@ protected AbstractCharClass computeValue() { return new AbstractCharClass() { public boolean contains(int ch) { - return Character.isTitleCase((char) ch); + return Character.isTitleCase(ch); } }; } @@ -426,24 +583,30 @@ static class LazyJavaUnicodeIdentifierPart extends LazyCharClass { protected AbstractCharClass computeValue() { - return new AbstractCharClass() { + AbstractCharClass chCl = new AbstractCharClass() { public boolean contains(int ch) { - return Character.isUnicodeIdentifierPart((char) ch); + return Character.isUnicodeIdentifierPart(ch); } }; + + chCl.mayContainSupplCodepoints = true; + return chCl; } } static class LazyJavaUnicodeIdentifierStart extends LazyCharClass { protected AbstractCharClass computeValue() { - return new AbstractCharClass() { + AbstractCharClass chCl = new AbstractCharClass() { public boolean contains(int ch) { - return Character.isUnicodeIdentifierStart((char) ch); + return Character.isUnicodeIdentifierStart(ch); } }; + + chCl.mayContainSupplCodepoints = true; + return chCl; } } - + /** * character classes generated from * http://www.unicode.org/reports/tr18/ @@ -619,44 +782,43 @@ { "ArabicPresentationForms-B", new LazyRange(0xFE70, 0xFEFF) }, //$NON-NLS-1$ { "HalfwidthandFullwidthForms", new LazyRange(0xFF00, 0xFFEF) }, //$NON-NLS-1$ { "Specials", new LazySpecialsBlock() }, //$NON-NLS-1$ - { "Cn", new LazyCategory(Character.UNASSIGNED) }, //$NON-NLS-1$ - { "IsL", new LazyCategoryScope(0x3E) }, //$NON-NLS-1$ - { "Lu", new LazyCategory(Character.UPPERCASE_LETTER) }, //$NON-NLS-1$ - { "Ll", new LazyCategory(Character.LOWERCASE_LETTER) }, //$NON-NLS-1$ - { "Lt", new 
LazyCategory(Character.TITLECASE_LETTER) }, //$NON-NLS-1$ - { "Lm", new LazyCategory(Character.MODIFIER_LETTER) }, //$NON-NLS-1$ - { "Lo", new LazyCategory(Character.OTHER_LETTER) }, //$NON-NLS-1$ - { "IsM", new LazyCategoryScope(0x1C0) }, //$NON-NLS-1$ - { "Mn", new LazyCategory(Character.NON_SPACING_MARK) }, //$NON-NLS-1$ - { "Me", new LazyCategory(Character.ENCLOSING_MARK) }, //$NON-NLS-1$ - { "Mc", new LazyCategory(Character.COMBINING_SPACING_MARK) }, //$NON-NLS-1$ - { "N", new LazyCategoryScope(0xE00) }, //$NON-NLS-1$ - { "Nd", new LazyCategory(Character.DECIMAL_DIGIT_NUMBER) }, //$NON-NLS-1$ - { "Nl", new LazyCategory(Character.LETTER_NUMBER) }, //$NON-NLS-1$ - { "No", new LazyCategory(Character.OTHER_NUMBER) }, //$NON-NLS-1$ - { "IsZ", new LazyCategoryScope(0x7000) }, //$NON-NLS-1$ - { "Zs", new LazyCategory(Character.SPACE_SEPARATOR) }, //$NON-NLS-1$ - { "Zl", new LazyCategory(Character.LINE_SEPARATOR) }, //$NON-NLS-1$ - { "Zp", new LazyCategory(Character.PARAGRAPH_SEPARATOR) }, //$NON-NLS-1$ - { "IsC", new LazyCategoryScope(0xF0000) }, //$NON-NLS-1$ - { "Cc", new LazyCategory(Character.CONTROL) }, //$NON-NLS-1$ - { "Cf", new LazyCategory(Character.FORMAT) }, //$NON-NLS-1$ - { "Co", new LazyCategory(Character.PRIVATE_USE) }, //$NON-NLS-1$ - { "Cs", new LazyCategory(Character.SURROGATE) }, //$NON-NLS-1$ - { "IsP", new LazyCategoryScope(0xF8000) }, //$NON-NLS-1$ - { "Pd", new LazyCategory(Character.DASH_PUNCTUATION) }, //$NON-NLS-1$ - { "Ps", new LazyCategory(Character.START_PUNCTUATION) }, //$NON-NLS-1$ - { "Pe", new LazyCategory(Character.END_PUNCTUATION) }, //$NON-NLS-1$ - { "Pc", new LazyCategory(Character.CONNECTOR_PUNCTUATION) }, //$NON-NLS-1$ - { "Po", new LazyCategory(Character.OTHER_PUNCTUATION) }, //$NON-NLS-1$ - { "IsS", new LazyCategoryScope(0x7E000000) }, //$NON-NLS-1$ - { "Sm", new LazyCategory(Character.MATH_SYMBOL) }, //$NON-NLS-1$ - { "Sc", new LazyCategory(Character.CURRENCY_SYMBOL) }, //$NON-NLS-1$ - { "Sk", new 
LazyCategory(Character.MODIFIER_SYMBOL) }, //$NON-NLS-1$ - { "So", new LazyCategory(Character.OTHER_SYMBOL) }, //$NON-NLS-1$ - { "Pi", new LazyCategory(Character.INITIAL_QUOTE_PUNCTUATION) }, //$NON-NLS-1$ - { "Pf", new LazyCategory(Character.FINAL_QUOTE_PUNCTUATION) } }; //$NON-NLS-1$ - + { "Cn", new LazyCategory(Character.UNASSIGNED, true) }, + { "IsL", new LazyCategoryScope(0x3E, true) }, + { "Lu", new LazyCategory(Character.UPPERCASE_LETTER, true) }, + { "Ll", new LazyCategory(Character.LOWERCASE_LETTER, true) }, + { "Lt", new LazyCategory(Character.TITLECASE_LETTER, false) }, + { "Lm", new LazyCategory(Character.MODIFIER_LETTER, false) }, + { "Lo", new LazyCategory(Character.OTHER_LETTER, true) }, + { "IsM", new LazyCategoryScope(0x1C0, true) }, + { "Mn", new LazyCategory(Character.NON_SPACING_MARK, true) }, + { "Me", new LazyCategory(Character.ENCLOSING_MARK, false) }, + { "Mc", new LazyCategory(Character.COMBINING_SPACING_MARK, true) }, + { "N", new LazyCategoryScope(0xE00, true) }, + { "Nd", new LazyCategory(Character.DECIMAL_DIGIT_NUMBER, true) }, + { "Nl", new LazyCategory(Character.LETTER_NUMBER, true) }, + { "No", new LazyCategory(Character.OTHER_NUMBER, true) }, + { "IsZ", new LazyCategoryScope(0x7000, false) }, + { "Zs", new LazyCategory(Character.SPACE_SEPARATOR, false) }, + { "Zl", new LazyCategory(Character.LINE_SEPARATOR, false) }, + { "Zp", new LazyCategory(Character.PARAGRAPH_SEPARATOR, false) }, + { "IsC", new LazyCategoryScope(0xF0000, true, true) }, + { "Cc", new LazyCategory(Character.CONTROL, false) }, + { "Cf", new LazyCategory(Character.FORMAT, true) }, + { "Co", new LazyCategory(Character.PRIVATE_USE, true) }, + { "Cs", new LazyCategory(Character.SURROGATE, false, true) }, + { "IsP", new LazyCategoryScope(0xF8000, true) }, + { "Pd", new LazyCategory(Character.DASH_PUNCTUATION, false) }, + { "Ps", new LazyCategory(Character.START_PUNCTUATION, false) }, + { "Pe", new LazyCategory(Character.END_PUNCTUATION, false) }, + { "Pc", new 
LazyCategory(Character.CONNECTOR_PUNCTUATION, false) }, + { "Po", new LazyCategory(Character.OTHER_PUNCTUATION, true) }, + { "IsS", new LazyCategoryScope(0x7E000000, true) }, + { "Sm", new LazyCategory(Character.MATH_SYMBOL, true) }, + { "Sc", new LazyCategory(Character.CURRENCY_SYMBOL, false) }, + { "Sk", new LazyCategory(Character.MODIFIER_SYMBOL, false) }, + { "So", new LazyCategory(Character.OTHER_SYMBOL, true) }, + { "Pi", new LazyCategory(Character.INITIAL_QUOTE_PUNCTUATION, false) }, + { "Pf", new LazyCategory(Character.FINAL_QUOTE_PUNCTUATION, false) } }; public Object[][] getContents() { return contents; } Index: modules/regex/src/main/java/java/util/regex/UCISupplRangeSet.java =================================================================== --- modules/regex/src/main/java/java/util/regex/UCISupplRangeSet.java (revision 0) +++ modules/regex/src/main/java/java/util/regex/UCISupplRangeSet.java (revision 0) @@ -0,0 +1,107 @@ +/* + * Copyright 2006 The Apache Software Foundation or its licensors, as applicable. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * + * Portions, Copyright © 1991-2005 Unicode, Inc. The following applies to Unicode. + * + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright © 1991-2005 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. 
Permission is + * hereby granted, free of charge, to any person obtaining a copy of the + * Unicode data files and any associated documentation (the "Data Files") + * or Unicode software and any associated documentation (the "Software") + * to deal in the Data Files or Software without restriction, including without + * limitation the rights to use, copy, modify, merge, publish, distribute, + * and/or sell copies of the Data Files or Software, and to permit persons + * to whom the Data Files or Software are furnished to do so, provided that + * (a) the above copyright notice(s) and this permission notice appear with + * all copies of the Data Files or Software, (b) both the above copyright + * notice(s) and this permission notice appear in associated documentation, + * and (c) there is clear notice in each modified Data File or in the Software + * as well as in the documentation associated with the Data File(s) or Software + * that the data or software has been modified. + + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT + * OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + * + * 2. 
Additional terms from the Database: + * + * Copyright © 1995-1999 Unicode, Inc. All Rights reserved. + * + * Disclaimer + * + * The Unicode Character Database is provided as is by Unicode, Inc. + * No claims are made as to fitness for any particular purpose. No warranties + * of any kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been purchased + * on magnetic or optical media from Unicode, Inc., the sole remedy for any claim + * will be exchange of defective media within 90 days of receipt. This disclaimer + * is applicable for all other data files accompanying the Unicode Character Database, + * some of which have been compiled by the Unicode Consortium, and some of which + * have been supplied by other sources. + * + * Limitations on Rights to Redistribute This Data + * + * Recipient is granted the right to make copies in any form for internal + * distribution and to freely use the information supplied in the creation of + * products supporting the UnicodeTM Standard. The files in + * the Unicode Character Database can be redistributed to third parties or other + * organizations (whether for profit or not) as long as this notice and the disclaimer + * notice are retained. Information can be extracted from these files and used + * in documentation or programs, as long as there is an accompanying notice + * indicating the source. + */ + +package java.util.regex; + +/** + * Represents a node accepting single character from the given char class + * in Unicode case-insensitive manner. + * This character can be supplementary (2 chars to represent) or from + * basic multilingual plane (1 char to represent).
+ */ +class UCISupplRangeSet extends SupplRangeSet{ + + public UCISupplRangeSet(AbstractCharClass cs, AbstractSet next) { + super(cs, next); + } + + public UCISupplRangeSet(AbstractCharClass cc) { + super(cc); + } + + public boolean contains(int ch) { + return chars.contains(Character.toLowerCase(Character.toUpperCase(ch))); + } + + protected String getName() { + return "UCI range:" + (alt ? "^ " : " ") + chars.toString(); + } +}