Index: src/test/java/org/apache/harmony/tests/java/util/regex/PatternTest.java =================================================================== --- src/test/java/org/apache/harmony/tests/java/util/regex/PatternTest.java (revision 393849) +++ src/test/java/org/apache/harmony/tests/java/util/regex/PatternTest.java (working copy) @@ -152,17 +152,215 @@ //bug6544 } - public void testPattern() { } public void testFlags() { + String baseString; + String testString; + Pattern pat; + Matcher mat; + + baseString = "((?i)|b)a"; + testString = "A"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertFalse(mat.matches()); + + baseString = "(?i)a|b"; + testString = "A"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "(?i)a|b"; + testString = "B"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "c|(?i)a|b"; + testString = "B"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "(?i)a|(?s)b"; + testString = "B"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "(?i)a|(?-i)b"; + testString = "B"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertFalse(mat.matches()); + + baseString = "(?i)a|(?-i)c|b"; + testString = "B"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertFalse(mat.matches()); + + baseString = "(?i)a|(?-i)c|(?i)b"; + testString = "B"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "(?i)a|(?-i)b"; + testString = "A"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "((?i))a"; + testString = "A"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + 
assertFalse(mat.matches()); + + baseString = "|(?i)|a"; + testString = "A"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "(?i)((?s)a.)"; + testString = "A\n"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "(?i)((?-i)a)"; + testString = "A"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertFalse(mat.matches()); + + baseString = "(?i)(?s:a.)"; + testString = "A\n"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "(?i)fgh(?s:aa)"; + testString = "fghAA"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "(?i)((?-i))a"; + testString = "A"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "abc(?i)d"; + testString = "ABCD"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertFalse(mat.matches()); + + testString = "abcD"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "a(?i)a(?-i)a(?i)a(?-i)a"; + testString = "aAaAa"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "aAAAa"; + mat = pat.matcher(testString); + assertFalse(mat.matches()); +} + + public void testFlagsMethod() { + String baseString; + Pattern pat; + + /* + * These tests are for compatibility with RI only. + * Logically we have to return only flags specified + * during the compilation. For example + * pat.flags() == 0 when we compile + * Pattern pat = Pattern.compile("(?i)abc(?-i)"); + * but the whole expression is compiled in a case insensitive + * manner. So there is little sense to do + * calls to flags() now. 
+ */ + baseString ="(?-i)"; + pat = Pattern.compile(baseString); + + baseString = "(?idmsux)abc(?-i)vg(?-dmu)"; + pat = Pattern.compile(baseString); + assertEquals(pat.flags(), Pattern.DOTALL | Pattern.COMMENTS); + + baseString = "(?idmsux)abc|(?-i)vg|(?-dmu)"; + pat = Pattern.compile(baseString); + assertEquals(pat.flags(), Pattern.DOTALL | Pattern.COMMENTS); + + baseString = "(?is)a((?x)b.)"; + pat = Pattern.compile(baseString); + assertEquals(pat.flags(), Pattern.DOTALL | Pattern.CASE_INSENSITIVE); + + baseString = "(?i)a((?-i))"; + pat = Pattern.compile(baseString); + assertEquals(pat.flags(), Pattern.CASE_INSENSITIVE); + + baseString = "((?i)a)"; + pat = Pattern.compile(baseString); + assertEquals(pat.flags(), 0); + + pat = Pattern.compile("(?is)abc"); + assertEquals(pat.flags(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL); } /* * Class under test for Pattern compile(String, int) */ public void testCompileStringint() { + + /* + * this tests are needed to verify that + * appropriate exceptions are thrown + */ + String pattern = "b)a"; + try { + Pattern pat = Pattern.compile(pattern); + } catch (PatternSyntaxException e) { + System.out.println(e); + } + pattern = "bcde)a"; + try { + Pattern pat = Pattern.compile(pattern); + } catch (PatternSyntaxException e) { + System.out.println(e); + } + pattern = "bbg())a"; + try { + Pattern pat = Pattern.compile(pattern); + } catch (PatternSyntaxException e) { + System.out.println(e); + } + + pattern = "cdb(?i))a"; + try { + Pattern pat = Pattern.compile(pattern); + } catch (Exception e) { + System.out.println(e); + } + + /* + * this pattern doesn't match any string, + * but should be compiled anyway + */ + pattern = "(b\\1)a"; + Pattern pat = Pattern.compile(pattern); + } /* @@ -496,6 +694,9 @@ } assertEquals(3, k); + pat = Pattern.compile(".*(.)\\1"); + mat = pat.matcher("saa"); + assertTrue(mat.matches()); } public void _testBackReferences1() { @@ -614,7 +815,214 @@ assertFalse(pat.matcher("cde.log").matches()); } + + 
public void _testCorrectReplacementBackreferencedJointSet() { + Pattern pat = Pattern.compile("ab(a)*\\1"); + pat = Pattern.compile("abc(cd)fg"); + pat = Pattern.compile("aba*cd"); + pat = Pattern.compile("ab(a)*+cd"); + pat = Pattern.compile("ab(a)*?cd"); + pat = Pattern.compile("ab(a)+cd"); + pat = Pattern.compile(".*(.)\\1"); + pat = Pattern.compile("ab((a)|c|d)e"); + pat = Pattern.compile("abc((a(b))cd)"); + pat = Pattern.compile("ab(a)++cd"); + pat = Pattern.compile("ab(a)?(c)d"); + pat = Pattern.compile("ab(a)?+cd"); + pat = Pattern.compile("ab(a)??cd"); + pat = Pattern.compile("ab(a)??cd"); + pat = Pattern.compile("ab(a){1,3}?(c)d"); + } + + public void testCompilePatternWithTerminatorMark() { + Pattern pat = Pattern.compile("a\u0000\u0000cd"); + Matcher mat = pat.matcher("a\u0000\u0000cd"); + assertTrue(mat.matches()); + } + + public void testAlternations() { + String baseString = "|a|bc"; + Pattern pat = Pattern.compile(baseString); + Matcher mat = pat.matcher(""); + + assertTrue(mat.matches()); + + baseString = "a||bc"; + pat = Pattern.compile(baseString); + mat = pat.matcher(""); + assertTrue(mat.matches()); + + baseString = "a|bc|"; + pat = Pattern.compile(baseString); + mat = pat.matcher(""); + assertTrue(mat.matches()); + + baseString = "a|b|"; + pat = Pattern.compile(baseString); + mat = pat.matcher(""); + assertTrue(mat.matches()); + + baseString = "a(|b|cd)e"; + pat = Pattern.compile(baseString); + mat = pat.matcher("ae"); + assertTrue(mat.matches()); + + baseString = "a(b||cd)e"; + pat = Pattern.compile(baseString); + mat = pat.matcher("ae"); + assertTrue(mat.matches()); + + baseString = "a(b|cd|)e"; + pat = Pattern.compile(baseString); + mat = pat.matcher("ae"); + assertTrue(mat.matches()); + + baseString = "a(b|c|)e"; + pat = Pattern.compile(baseString); + mat = pat.matcher("ae"); + assertTrue(mat.matches()); + + baseString = "a(|)e"; + pat = Pattern.compile(baseString); + mat = pat.matcher("ae"); + assertTrue(mat.matches()); + + baseString = 
"|"; + pat = Pattern.compile(baseString); + mat = pat.matcher(""); + assertTrue(mat.matches()); + + baseString = "a(?:|)e"; + pat = Pattern.compile(baseString); + mat = pat.matcher("ae"); + assertTrue(mat.matches()); + + baseString = "a||||bc"; + pat = Pattern.compile(baseString); + mat = pat.matcher(""); + assertTrue(mat.matches()); + + baseString = "(?i-is)|a"; + pat = Pattern.compile(baseString); + mat = pat.matcher("a"); + assertTrue(mat.matches()); + } + + public void testMatchWithGroups() { + String baseString + = "jwkerhjwehrkwjehrkwjhrwkjehrjwkehrjkwhrkwehrkwhrkwrhwkhrwkjehr"; + String pattern = ".*(..).*\\1.*"; + assertTrue(Pattern.compile(pattern).matcher(baseString) + .matches()); + baseString = "saa"; + pattern = ".*(.)\\1"; + assertTrue(Pattern.compile(pattern).matcher(baseString) + .matches()); + assertTrue(Pattern.compile(pattern).matcher(baseString) + .find()); + } + + public void testSplitEmptyCharSequence() { + String s1 = ""; + String[] arr = s1.split(":"); + assertEquals(arr.length, 1); + } + + public void testSplitEndsWithPattern() { + assertEquals(",,".split(",", 3).length, 3); + assertEquals(",,".split(",", 4).length, 3); + + assertEquals(Pattern.compile("o").split("boo:and:foo",5).length, 5); + assertEquals(Pattern.compile("b").split("ab", -1).length, 2); + } + + public void testCaseInsensitiveFlag() { + assertTrue(Pattern.matches("(?i-:AbC)", "ABC")); + } + + public void testEmptyGroups() { + Pattern pat = Pattern.compile("ab(?>)cda"); + Matcher mat = pat.matcher("abcda"); + assertTrue(mat.matches()); + + pat = Pattern.compile("ab()"); + mat = pat.matcher("ab"); + assertTrue(mat.matches()); + + pat = Pattern.compile("abc(?:)(..)"); + mat = pat.matcher("abcgf"); + assertTrue(mat.matches()); + } + public void testCompileNonCaptGroup() { + boolean isCompiled = false; + + try { + Pattern pat = Pattern.compile("(?:)", Pattern.CANON_EQ); + pat = Pattern.compile("(?:)", Pattern.CANON_EQ | Pattern.DOTALL); + pat = Pattern.compile("(?:)", 
Pattern.CANON_EQ | Pattern.CASE_INSENSITIVE); + pat = Pattern.compile("(?:)", Pattern.CANON_EQ | Pattern.COMMENTS | Pattern.UNIX_LINES); + isCompiled = true; + } catch (PatternSyntaxException e) { + System.out.println(e); + } + assertTrue(isCompiled); + } + + public void testEmbeddedFlags(){ + String baseString = "(?i)((?s)a)"; + String testString = "A"; + Pattern pat = Pattern.compile(baseString); + Matcher mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "(?x)(?i)(?s)(?d)a"; + testString = "A"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "(?x)(?i)(?s)(?d)a."; + testString = "a\n"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "abc(?x:(?i)(?s)(?d)a.)"; + testString = "abcA\n"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "abc((?x)d)(?i)(?s)a"; + testString = "abcdA"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + } + + public void testAltWithFlags() { + boolean isCompiled = false; + + try { + Pattern pat = Pattern.compile("|(?i-xi)|()"); + isCompiled = true; + } catch (PatternSyntaxException e) { + System.out.println(e); + } + assertTrue(isCompiled); + } + + public void testRestoreFlagsAfterGroup() { + String baseString = "abc((?x)d) a"; + String testString = "abcd a"; + Pattern pat = Pattern.compile(baseString); + Matcher mat = pat.matcher(testString); + + assertTrue(mat.matches()); + } + + public static void main(String[] args) { junit.textui.TestRunner.run(PatternTest.class); } Index: src/main/java/java/util/regex/BackReferencedSingleSet.java =================================================================== --- src/main/java/java/util/regex/BackReferencedSingleSet.java (revision 0) +++ src/main/java/java/util/regex/BackReferencedSingleSet.java (revision 0) @@ -0,0 
+1,124 @@ +/* + * Copyright 2005-2006 The Apache Software Foundation or its licensors, as applicable. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay Kuznetsov + * @version $Revision: 1.1 $ + */ + +package java.util.regex; + +/** + * Group node over subexpression w/o alternations. + * This node is used if current group is referenced + * via backreference. + */ + +class BackReferencedSingleSet extends SingleSet { + + /* + * This class is needed only for overriding find() + * and findBack() methods of SingleSet class, which is being + * back referenced. The following example explains the need + * for such substitution: + * Let's consider the pattern ".*(.)\\1". + * Leading .* works as follows: finds line terminator and runs findBack + * from that point. findBack method in its turn (in contrast to matches) + * sets group boundaries on the back trace. Thus at the point we + * try to match back reference(\\1) groups are not yet set. + * + * To fix this problem we replace backreferenced groups with instances of + * this class, which will use matches instead of find; this will affect + * performance, but ensure correctness of the match. 
+ */ + + public BackReferencedSingleSet(AbstractSet child, FSet fSet) { + super(child, fSet); + } + + public BackReferencedSingleSet(SingleSet node) { + super(node.kid, ((FSet) node.fSet)); + } + + /** + * Scans forward from stringIndex up to matchResult.getRightBound(). + * The group start is set before kid.matches() is tried at each + * position and restored if that attempt fails. + * + * @return the index at which kid matched, or a negative value if + * no position matched or the range was empty + */ + public int find(int stringIndex, CharSequence testString, + MatchResultImpl matchResult) { + int res = -1; + int lastIndex = matchResult.getRightBound(); + int startSearch = stringIndex; + + for (; startSearch <= lastIndex; startSearch++) { + int saveStart = matchResult.getStart(groupIndex); + + matchResult.setStart(groupIndex, startSearch); + res = kid.matches(startSearch, testString, matchResult); + if (res >= 0) { + res = startSearch; + break; + } else { + matchResult.setStart(groupIndex, saveStart); + } + } + + return res; + } + + /** + * Scans backward from lastIndex down to stringIndex. The group start + * is set before kid.matches() is tried at each position and restored + * if that attempt fails. + * + * @return the index at which kid matched, or a negative value if + * no position matched or the range was empty + */ + public int findBack(int stringIndex, int lastIndex, + CharSequence testString, MatchResultImpl matchResult) { + int res = -1; + int startSearch = lastIndex; + + for (; startSearch >= stringIndex; startSearch--) { + int saveStart = matchResult.getStart(groupIndex); + + matchResult.setStart(groupIndex, startSearch); + res = kid.matches(startSearch, testString, matchResult); + if (res >= 0) { + res = startSearch; + break; + } else { + matchResult.setStart(groupIndex, saveStart); + } + } + + return res; + } + + /** + * This node is already the replacement for a back referenced + * set, so it never requests another replacement. 
+ * + * @return always null + */ + public JointSet processBackRefReplacement() { + return null; + } +} Index: src/main/java/java/util/regex/Lexer.java =================================================================== --- src/main/java/java/util/regex/Lexer.java (revision 393849) +++ src/main/java/java/util/regex/Lexer.java (working copy) @@ -121,8 +121,6 @@ // when in literal mode, this field will save the previous one private int saved_mode = 0; - private int length = 0; - // previous char read private int lookBack; @@ -131,6 +129,9 @@ //next character private int lookAhead; + + //index of last char in pattern plus one + private int patternFullLength = 0; // cur special token private SpecialToken curST = null; @@ -162,10 +163,11 @@ } this.pattern = new char[pattern.length() + 2]; - System.arraycopy(pattern.toCharArray(), 0, this.pattern, 0, pattern - .length()); - this.pattern[this.pattern.length - 1] = this.pattern[this.pattern.length - 2] = 0; - + System.arraycopy(pattern.toCharArray(), 0, this.pattern, 0, + pattern.length()); + this.pattern[this.pattern.length - 1] = 0; + this.pattern[this.pattern.length - 2] = 0; + patternFullLength = this.pattern.length; this.flags = flags; // read first two tokens; movePointer(); @@ -200,13 +202,20 @@ } } - public void setFlags(int flags) { - if (((this.flags ^ flags) & Pattern.COMMENTS) != 0) { - this.flags = flags; - reread(); - } - + /** + * Restores flags for Lexer + * + * @param flags + */ + public void restoreFlags(int flags) { this.flags = flags; + lookAhead = ch; + lookAheadST = curST; + + //curToc is an index of closing bracket ) + index = curToc + 1; + lookAheadToc = curToc; + movePointer(); } public SpecialToken peekSpecial() { @@ -310,9 +319,10 @@ switch (lookAhead) { case 'E': { - mode = saved_mode; - lookAhead = (index < pattern.length - 2) ? pattern[nextIndex()] - : 0; + mode = saved_mode; + lookAhead = (index <= pattern.length - 2) + ? 
pattern[nextIndex()] + : 0; break; } @@ -541,16 +551,26 @@ } default: { lookAhead = readFlags(); - if (lookAhead >= 128) { - lookAhead = (lookAhead & 0x7f) << 16; - flags = lookAhead; + + /* + * We return res = res | 1 << 8 + * from readFlags() if we read + * (?idmsux-idmsux) + */ + if (lookAhead >= 256) { + + //Erase auxiliaury bit + lookAhead = (lookAhead & 0xff); + flags = lookAhead; + lookAhead = lookAhead << 16; lookAhead = CHAR_FLAGS | lookAhead; } else { + flags = lookAhead; lookAhead = lookAhead << 16; lookAhead = CHAR_NONCAP_GROUP - | lookAhead; + | lookAhead; } - break; + break; } } } else { @@ -737,7 +757,7 @@ * @return true if there are no more characters in the pattern. */ public boolean isEmpty() { - return ch == 0 && lookAhead == 0 && !isSpecial(); + return ch == 0 && lookAhead == 0 && index == patternFullLength && !isSpecial(); } /** @@ -818,56 +838,73 @@ char ch; boolean pos = true; int res = flags; - int neg = 0; + while (index < pattern.length) { ch = pattern[index]; switch (ch) { - case '-': { - if (!pos) + case '-': + if (!pos) { throw new PatternSyntaxException("Illegal " + "inline construct", this.toString(), index); + } pos = false; - } - + break; + case 'i': - res = pos ? res | Pattern.CASE_INSENSITIVE : res - ^ Pattern.CASE_INSENSITIVE & res; + res = pos + ? res | Pattern.CASE_INSENSITIVE + : (res ^ Pattern.CASE_INSENSITIVE) & res; break; + case 'd': - res = pos ? res | Pattern.UNIX_LINES : res ^ Pattern.UNIX_LINES - & res; + res = pos + ? res | Pattern.UNIX_LINES + : (res ^ Pattern.UNIX_LINES) & res; break; + case 'm': - res = pos ? res | Pattern.MULTILINE : res ^ Pattern.MULTILINE - & res; + res = pos + ? res | Pattern.MULTILINE + : (res ^ Pattern.MULTILINE) & res; break; + case 's': - res = pos ? res | Pattern.DOTALL : res ^ Pattern.DOTALL & res; + res = pos + ? res | Pattern.DOTALL + : (res ^ Pattern.DOTALL) & res; break; + case 'u': - res = pos ? res | Pattern.UNICODE_CASE : res - ^ Pattern.UNICODE_CASE & res; + res = pos + ? 
res | Pattern.UNICODE_CASE + : (res ^ Pattern.UNICODE_CASE) & res; break; + case 'x': - res = pos ? res | Pattern.COMMENTS : res ^ Pattern.COMMENTS - & res; + res = pos + ? res | Pattern.COMMENTS + : (res ^ Pattern.COMMENTS) & res; break; + case ':': nextIndex(); return res; + case ')': nextIndex(); - return res | (1 << 7); + return res | (1 << 8); + default: throw new PatternSyntaxException("Illegal inline construct", - this.toString(), index); + this.toString(), index); } nextIndex(); } - throw new PatternSyntaxException("Illegal inline construct", this - .toString(), index); + throw new PatternSyntaxException("Illegal inline construct", + this.toString(), index); } + /** * Returns next character index to read and moves pointer to the next one. * If comments flag is on this method will skip comments and whitespaces. Index: src/main/java/java/util/regex/JointSet.java =================================================================== --- src/main/java/java/util/regex/JointSet.java (revision 393849) +++ src/main/java/java/util/regex/JointSet.java (working copy) @@ -96,4 +96,56 @@ return !(matchResult.getEnd(groupIndex) >= 0 && matchResult .getStart(groupIndex) == matchResult.getEnd(groupIndex)); } + + /** + * This method is used for traversing nodes after the + * first stage of compilation. 
+ */ + public void processSecondPass() { + this.isSecondPassVisited = true; + + if (fSet != null && !fSet.isSecondPassVisited) { + + /* + * Add here code to do during the pass + */ + + /* + * End code to do during the pass + */ + fSet.processSecondPass(); + } + + if (children != null) { + int childrenSize = children.size(); + + for (int i = 0; i < childrenSize; i++) { + AbstractSet child = (AbstractSet) children.get(i); + + /* + * Add here code to do during the pass + */ + + JointSet set = child.processBackRefReplacement(); + + if (set != null) { + child.isSecondPassVisited = true; + children.remove(i); + children.add(i, set); + child = (AbstractSet) set; + } + + /* + * End code to do during the pass + */ + if (!child.isSecondPassVisited) { + child.processSecondPass(); + } + } + } + + if (next != null) { + super.processSecondPass(); + } + } } Index: src/main/java/java/util/regex/Pattern.java =================================================================== --- src/main/java/java/util/regex/Pattern.java (revision 393849) +++ src/main/java/java/util/regex/Pattern.java (working copy) @@ -75,6 +75,8 @@ * @com.intel.drl.spec_ref */ public static final int CANON_EQ = 1 << 7; + + static final int BACK_REF_NUMBER = 10; /** * Current pattern to be compiled; @@ -87,6 +89,16 @@ private int flags = 0; private String pattern = null; + + /* + * All backreferences that may be used in pattern. 
+ */ + transient private FSet backRefs [] = new FSet [BACK_REF_NUMBER]; + + /* + * Is true if backreferenced sets replacement is needed + */ + transient private boolean needsBackRefReplacement = false; transient private int groupIndex = -1; @@ -191,13 +203,13 @@ this.lexemes = new Lexer(regex, flags); this.flags = flags; - start = processExpression(-1, flags, null); + start = processExpression(-1, this.flags, null); if (!lexemes.isEmpty()) { throw new PatternSyntaxException(I18n .getMessage("Trailing characters"), lexemes.toString(), lexemes.getIndex()); } - + finalizeCompile(); return this; } @@ -227,107 +239,123 @@ /** * E->AE; E->S|E; E->S; A->(a|)+ E->S(|S)* */ - private AbstractSet processExpression(int ch, int new_flags, + private AbstractSet processExpression(int ch, int newFlags, AbstractSet last) { ArrayList children = new ArrayList(); AbstractSet child; - int safe_flags = flags; + int saveFlags = flags; FSet fSet; + boolean saveChangedFlags = false; - if (new_flags != flags) { - flags = new_flags; - lexemes.setFlags(flags); + if (newFlags != flags) { + flags = newFlags; } switch (ch) { - case Lexer.CHAR_NONCAP_GROUP: { + case Lexer.CHAR_NONCAP_GROUP: fSet = new NonCapFSet(++consCount); break; - } - + case Lexer.CHAR_POS_LOOKAHEAD: - case Lexer.CHAR_NEG_LOOKAHEAD: { + /* falls through */ + + case Lexer.CHAR_NEG_LOOKAHEAD: fSet = new AheadFSet(); break; - } + case Lexer.CHAR_POS_LOOKBEHIND: - case Lexer.CHAR_NEG_LOOKBEHIND: { + /* falls through */ + + case Lexer.CHAR_NEG_LOOKBEHIND: fSet = new BehindFSet(++consCount); break; - } - - case Lexer.CHAR_ATOMIC_GROUP: { + + case Lexer.CHAR_ATOMIC_GROUP: fSet = new AtomicFSet(++consCount); break; - } - - default: { + + default: globalGroupIndex++; if (last == null) { - fSet = new FinalSet(); - // expr = new StartSet(); + + // expr = new StartSet(); + fSet = new FinalSet(); + saveChangedFlags = true; } else { - fSet = new FSet(globalGroupIndex); - // expr = new JointSet(globalGroupIndex); + + // expr = new 
JointSet(globalGroupIndex); + fSet = new FSet(globalGroupIndex); } + if (globalGroupIndex > -1 && globalGroupIndex < 10) { + backRefs[globalGroupIndex] = fSet; + } + break; } - } do { if (lexemes.isLetter() && lexemes.lookAhead() == Lexer.CHAR_VERTICAL_BAR) { child = processAlternations(fSet); + } else if (lexemes.peek() == Lexer.CHAR_VERTICAL_BAR){ + child = new EmptySet(fSet); + lexemes.next(); } else { child = processSubExpression(fSet); - if (lexemes.peek() == Lexer.CHAR_VERTICAL_BAR) + if (lexemes.peek() == Lexer.CHAR_VERTICAL_BAR) { lexemes.next(); + } } - if (child != null) - children.add(child); - // expr.addChild(child); - } while (!(lexemes.isEmpty() || lexemes.peek() == Lexer.CHAR_RIGHT_PARENTHESIS)); - - if (flags != safe_flags) { - flags = safe_flags; - lexemes.setFlags(flags); + if (child != null) { + + //expr.addChild(child); + children.add(child); + } + } while (!(lexemes.isEmpty() + || (lexemes.peek() == Lexer.CHAR_RIGHT_PARENTHESIS))); + + if (lexemes.back() == Lexer.CHAR_VERTICAL_BAR) { + children.add(new EmptySet(fSet)); } + + if (flags != saveFlags && !saveChangedFlags) { + flags = saveFlags; + lexemes.restoreFlags(flags); + } switch (ch) { - case Lexer.CHAR_NONCAP_GROUP: { + case Lexer.CHAR_NONCAP_GROUP: return new NonCapJointSet(children, fSet); - } - case Lexer.CHAR_POS_LOOKAHEAD: { - return new PositiveLookAhead(children, fSet); - } + + case Lexer.CHAR_POS_LOOKAHEAD: + return new PositiveLookAhead(children, fSet); - case Lexer.CHAR_NEG_LOOKAHEAD: { + case Lexer.CHAR_NEG_LOOKAHEAD: return new NegativeLookAhead(children, fSet); - } - - case Lexer.CHAR_POS_LOOKBEHIND: { + + case Lexer.CHAR_POS_LOOKBEHIND: return new PositiveLookBehind(children, fSet); - } - - case Lexer.CHAR_NEG_LOOKBEHIND: { + + case Lexer.CHAR_NEG_LOOKBEHIND: return new NegativeLookBehind(children, fSet); - } - - case Lexer.CHAR_ATOMIC_GROUP: { + + case Lexer.CHAR_ATOMIC_GROUP: return new AtomicJointSet(children, fSet); - } - default: { + + default: switch (children.size()) 
{ case 0: return new EmptySet(fSet); + case 1: return new SingleSet((AbstractSet) children.get(0), fSet); + default: return new JointSet(children, fSet); } } - } } + /** * T->a+ */ @@ -363,13 +391,22 @@ if (lexemes.isLetter() && !lexemes.isNextSpecial() && Lexer.isLetter(lexemes.lookAhead())) { cur = processSequence(last); + } else if (lexemes.peek() == Lexer.CHAR_RIGHT_PARENTHESIS) { + if (last instanceof FinalSet) { + throw new PatternSyntaxException(I18n + .getMessage("unmatched )"), lexemes.toString(), + lexemes.getIndex()); + } else { + cur = new EmptySet(last); + } } else { cur = processQuantifier(last); } if (!lexemes.isEmpty() // && !pattern.isQuantifier() - && lexemes.peek() != Lexer.CHAR_RIGHT_PARENTHESIS + && (lexemes.peek() != Lexer.CHAR_RIGHT_PARENTHESIS + || last instanceof FinalSet) && lexemes.peek() != Lexer.CHAR_VERTICAL_BAR) { AbstractSet next = processSubExpression(last); if (cur instanceof LeafQuantifierSet @@ -406,8 +443,6 @@ */ private AbstractSet processQuantifier(AbstractSet last) { AbstractSet term = processTerminal(last); - SpecialToken quantifier = null; - int quant = lexemes.peek(); if (term != null && !(term instanceof LeafSet)) { @@ -578,24 +613,25 @@ do { ch = lexemes.peek(); if ((ch & 0x8000ffff) == Lexer.CHAR_LEFT_PARENTHESIS) { - lexemes.next(); - int new_flags = (ch & 0x00ff0000) >> 16; - ch = ch & 0xff00ffff; + int newFlags; + lexemes.next(); + newFlags = (ch & 0x00ff0000) >> 16; + ch = ch & 0xff00ffff; - if (ch == Lexer.CHAR_FLAGS) { - flags = new_flags; - lexemes.setFlags(new_flags); - } else { - new_flags = (ch == Lexer.CHAR_NONCAP_GROUP) ? new_flags - : flags; - term = processExpression(ch, new_flags, last); - if (lexemes.peek() != Lexer.CHAR_RIGHT_PARENTHESIS) - throw new PatternSyntaxException(I18n - .getMessage("unmatched ("), lexemes.toString(), - lexemes.getIndex()); - lexemes.next(); - } - + if (ch == Lexer.CHAR_FLAGS) { + flags = newFlags; + } else { + newFlags = (ch == Lexer.CHAR_NONCAP_GROUP) + ? 
newFlags + : flags; + term = processExpression(ch, newFlags, last); + if (lexemes.peek() != Lexer.CHAR_RIGHT_PARENTHESIS) { + throw new PatternSyntaxException(I18n + .getMessage("unmatched ("), lexemes.toString(), + lexemes.getIndex()); + } + lexemes.next(); + } } else switch (ch) { case Lexer.CHAR_LEFT_SQUARE_BRACKET: { @@ -718,6 +754,8 @@ } else { term = new UCIBackReferenceSet(number, consCount); } + (backRefs [number]).isBackReferenced = true; + needsBackRefReplacement = true; break; } else { throw new PatternSyntaxException(I18n @@ -733,6 +771,8 @@ term = new RangeSet(cc); } else if (!lexemes.isEmpty()) { term = new CharSet((char) ch); + } else { + term = new EmptySet(last); } lexemes.next(); break; @@ -754,6 +794,16 @@ term = new CharSet((char) ch); } lexemes.next(); + } else if (ch == Lexer.CHAR_VERTICAL_BAR) { + term = new EmptySet(last); + } else if (ch == Lexer.CHAR_RIGHT_PARENTHESIS) { + if (last instanceof FinalSet) { + throw new PatternSyntaxException(I18n + .getMessage("unmatched )"), lexemes.toString(), + lexemes.getIndex()); + } else { + term = new EmptySet(last); + } } else { throw new PatternSyntaxException(I18n .getMessage("Dangling meta construction") @@ -785,14 +835,12 @@ CharClass res = new CharClass(alt, hasFlag(Pattern.CASE_INSENSITIVE), hasFlag(Pattern.UNICODE_CASE)); int buffer = -1; - // TODO remove this one, being used for debug only - int ch = 0; boolean intersection = false; boolean notClosed = false; boolean firstInClass = true; while (!lexemes.isEmpty() - && (notClosed = (ch = lexemes.peek()) != Lexer.CHAR_RIGHT_SQUARE_BRACKET + && (notClosed = (lexemes.peek()) != Lexer.CHAR_RIGHT_SQUARE_BRACKET || firstInClass)) { switch (lexemes.peek()) { @@ -917,6 +965,21 @@ return compile(pattern, 0); } + /* + * This method do traverses of + * automata to finish compilation. 
+ */ + private void finalizeCompile() { + + /* + * Processing second pass + */ + if (needsBackRefReplacement) { //|| needsReason1 || needsReason2) { + start.processSecondPass(); + } + + } + /** * @com.intel.drl.spec_ref */ Index: src/main/java/java/util/regex/QuantifierSet.java =================================================================== --- src/main/java/java/util/regex/QuantifierSet.java (revision 393849) +++ src/main/java/java/util/regex/QuantifierSet.java (working copy) @@ -59,4 +59,75 @@ public boolean hasConsumed(MatchResultImpl mr) { return true; } + + /** + * This method is used for traversing nodes after the + * first stage of compilation. + */ + public void processSecondPass() { + this.isSecondPassVisited = true; + + if (next != null) { + + if (!next.isSecondPassVisited) { + + /* + * Add here code to do during the pass + */ + JointSet set = next.processBackRefReplacement(); + + if (set != null) { + next.isSecondPassVisited = true; + next =(AbstractSet) set; + } + + /* + * End code to do during the pass + */ + next.processSecondPass(); + } + } + + if (innerSet != null) { + + if (!innerSet.isSecondPassVisited) { + + /* + * Add here code to do during the pass + */ + JointSet set = innerSet.processBackRefReplacement(); + + if (set != null) { + innerSet.isSecondPassVisited = true; + innerSet =(AbstractSet) set; + } + + /* + * End code to do during the pass + */ + innerSet.processSecondPass(); + } else { + + /* + * We reach node through innerSet but it is already traversed. 
+ * You can see this situation for GroupQuantifierSet.innerset + * if we compile something like "(a)+" when + * GroupQuantifierSet == GroupQuantifierSet.innerset.fSet.next + */ + + /* + * Add here code to do during the pass + */ + if (innerSet instanceof SingleSet + && ((FSet) ((JointSet) innerSet).fSet) + .isBackReferenced) { + innerSet = innerSet.next; + } + + /* + * End code to do during the pass + */ + } + } + } } Index: src/main/java/java/util/regex/SingleSet.java =================================================================== --- src/main/java/java/util/regex/SingleSet.java (revision 393849) +++ src/main/java/java/util/regex/SingleSet.java (working copy) @@ -27,7 +27,7 @@ */ class SingleSet extends JointSet { - private AbstractSet kid; + protected AbstractSet kid; public SingleSet(AbstractSet child, FSet fSet) { this.kid = child; @@ -66,4 +66,62 @@ public boolean first(AbstractSet set) { return kid.first(set); } + + /** + * Creates the BackReferencedSingleSet that replaces this set + * and stores it in the next field. + */ + public JointSet processBackRefReplacement() { + BackReferencedSingleSet set = new BackReferencedSingleSet(this); + + /* + * We will store a reference to created BackReferencedSingleSet + * in next field. This is needed to process replacement + * of sets correctly since sometimes we cannot renew all references to + * detachable set in the current point of traverse. See + * QuantifierSet and AbstractSet processSecondPass() methods for + * more details. + */ + next = set; + return set; + } + + /** + * This method is used for traversing nodes after the + * first stage of compilation. 
+ */ + public void processSecondPass() { + this.isSecondPassVisited = true; + + if (fSet != null && !fSet.isSecondPassVisited) { + + /* + * Add here code to do during the pass + */ + + /* + * End code to do during the pass + */ + fSet.processSecondPass(); + } + + if (kid != null && !kid.isSecondPassVisited) { + + /* + * Add here code to do during the pass + */ + JointSet set = kid.processBackRefReplacement(); + + if (set != null) { + kid.isSecondPassVisited = true; + kid = (AbstractSet) set; + } + + /* + * End code to do during the pass + */ + + kid.processSecondPass(); + } + } } Index: src/main/java/java/util/regex/FSet.java =================================================================== --- src/main/java/java/util/regex/FSet.java (revision 393849) +++ src/main/java/java/util/regex/FSet.java (working copy) @@ -29,6 +29,8 @@ static PossessiveFSet posFSet = new PossessiveFSet(); + boolean isBackReferenced = false; + private int groupIndex; public FSet(int groupIndex) { Index: src/main/java/java/util/regex/AbstractSet.java =================================================================== --- src/main/java/java/util/regex/AbstractSet.java (revision 393849) +++ src/main/java/java/util/regex/AbstractSet.java (working copy) @@ -46,6 +46,8 @@ * Counter for debugging purposes, represent unique node index; */ static int counter = 1; + + protected boolean isSecondPassVisited = false; protected String index = new Integer(AbstractSet.counter++).toString(); @@ -193,4 +195,66 @@ public boolean first(AbstractSet set) { return true; } + + /** + * This method is used for replacing back-referenced + * sets. + * + * @return null if the current node does not need + * to be replaced; otherwise the JointSet + * that is the replacement of the + * current node + */ + public JointSet processBackRefReplacement() { + return null; + } + + /** + * This method is used for traversing nodes after the + * first stage of compilation. 
+ */ + public void processSecondPass() { + this.isSecondPassVisited = true; + + if (next != null) { + + if (!next.isSecondPassVisited) { + + /* + * Add here code to do during the pass + */ + JointSet set = next.processBackRefReplacement(); + + if (set != null) { + next.isSecondPassVisited = true; + next =(AbstractSet) set; + } + + /* + * End code to do during the pass + */ + next.processSecondPass(); + } else { + + /* + * We reach node through next but it is already traversed. + * You can see this situation for AltGroupQuantifierSet.next + * when we reach this node through + * AltGroupQuantifierSet.innerset. ... .next + */ + + /* + * Add here code to do during the pass + */ + if (next instanceof SingleSet + && ((FSet) ((JointSet) next).fSet).isBackReferenced) { + next = next.next; + } + + /* + * End code to do during the pass + */ + } + } + } } \ No newline at end of file