diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java index b415b61..4134dd3 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java @@ -246,15 +246,22 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { assertAnalyzesToReuse(a, input, new String[]{expected}); } - // simple utility method for blasting tokenstreams with data to make sure they don't do anything crazy - // TODO: add a MockCharStream, and use it here too, to ensure that correctOffset etc is being done by tokenizers. + /** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */ public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException { - checkRandomData(random, a, iterations, 20); + checkRandomData(random, a, iterations, false); + } + + /** + * utility method for blasting tokenstreams with data to make sure they don't do anything crazy + * @param simple true if only ascii strings will be used (try to avoid) + */ + public static void checkRandomData(Random random, Analyzer a, int iterations, boolean simple) throws IOException { + checkRandomData(random, a, iterations, 20, simple); // now test with multiple threads int numThreads = _TestUtil.nextInt(random, 4, 8); Thread threads[] = new Thread[numThreads]; for (int i = 0; i < threads.length; i++) { - threads[i] = new AnalysisThread(new Random(random.nextLong()), a, iterations); + threads[i] = new AnalysisThread(new Random(random.nextLong()), a, iterations, simple); } for (int i = 0; i < threads.length; i++) { threads[i].start(); @@ -272,11 +279,13 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { final int iterations; final Random random; final Analyzer a; + final boolean simple; - AnalysisThread(Random random, Analyzer a, int iterations) { + AnalysisThread(Random random, Analyzer a, int iterations, boolean simple) { this.random = random; this.a = a; this.iterations = iterations; + this.simple = simple; } @Override @@ -284,32 +293,36 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { try { // see the part in checkRandomData where it replays the same text again // to verify reproducability/reuse: hopefully this would catch thread hazards. - checkRandomData(random, a, iterations, 20); + checkRandomData(random, a, iterations, 20, simple); } catch (IOException e) { throw new RuntimeException(e); } } }; - public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException { - checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean()); + public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple) throws IOException { + checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple); } - public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter) throws IOException { + public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple) throws IOException { for (int i = 0; i < iterations; i++) { String text; - switch(_TestUtil.nextInt(random, 0, 4)) { - case 0: - text = _TestUtil.randomSimpleString(random); - break; - case 1: - text = _TestUtil.randomRealisticUnicodeString(random, maxWordLength); - break; - case 2: - text = _TestUtil.randomHtmlishString(random, maxWordLength); - break; - default: - text = _TestUtil.randomUnicodeString(random, maxWordLength); + if (simple) { + text = random.nextBoolean() ? _TestUtil.randomSimpleString(random) : _TestUtil.randomHtmlishString(random, maxWordLength); + } else { + switch(_TestUtil.nextInt(random, 0, 4)) { + case 0: + text = _TestUtil.randomSimpleString(random); + break; + case 1: + text = _TestUtil.randomRealisticUnicodeString(random, maxWordLength); + break; + case 2: + text = _TestUtil.randomHtmlishString(random, maxWordLength); + break; + default: + text = _TestUtil.randomUnicodeString(random, maxWordLength); + } } if (VERBOSE) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java index c6dbdd6..78af830 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java @@ -249,6 +249,36 @@ public class _TestUtil { } } + /** + * Returns a String thats "regexpish" (contains lots of operators typically found in regular expressions) + * If you call this enough times, you might get a valid regex! + */ + public static String randomRegexpishString(Random r) { + final int end = r.nextInt(20); + if (end == 0) { + // allow 0 length + return ""; + } + final char[] buffer = new char[end]; + for (int i = 0; i < end; i++) { + int t = r.nextInt(11); + if (t == 0) { + buffer[i] = (char) _TestUtil.nextInt(r, 97, 102); + } + else if (1 == t) buffer[i] = '.'; + else if (2 == t) buffer[i] = '?'; + else if (3 == t) buffer[i] = '*'; + else if (4 == t) buffer[i] = '+'; + else if (5 == t) buffer[i] = '('; + else if (6 == t) buffer[i] = ')'; + else if (7 == t) buffer[i] = '-'; + else if (8 == t) buffer[i] = '['; + else if (9 == t) buffer[i] = ']'; + else if (10 == t) buffer[i] = '|'; + } + return new String(buffer, 0, end); + } + private static final String[] HTML_CHAR_ENTITIES = { "AElig", "Aacute", "Acirc", "Agrave", "Alpha", "AMP", "Aring", "Atilde", "Auml", "Beta", "COPY", "Ccedil", "Chi", "Dagger", "Delta", "ETH", diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java index 77f5c95..cb29447 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java @@ -18,12 +18,13 @@ package org.apache.lucene.analysis.pattern; import java.io.IOException; -import java.util.LinkedList; +import java.io.Reader; +import java.io.StringReader; import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.apache.lucene.analysis.charfilter.BaseCharFilter; import org.apache.lucene.analysis.CharStream; +import org.apache.lucene.analysis.charfilter.BaseCharFilter; /** * CharFilter that uses a regular expression for the target of replace string. @@ -48,147 +49,88 @@ import org.apache.lucene.analysis.CharStream; * @since Solr 1.5 */ public class PatternReplaceCharFilter extends BaseCharFilter { + @Deprecated + public static final int DEFAULT_MAX_BLOCK_CHARS = 10000; private final Pattern pattern; private final String replacement; - private final int maxBlockChars; - private final String blockDelimiters; - public static final int DEFAULT_MAX_BLOCK_CHARS = 10000; - - private LinkedList buffer; - private int nextCharCounter; - private char[] blockBuffer; - private int blockBufferLength; - private String replaceBlockBuffer; - private int replaceBlockBufferOffset; - - public PatternReplaceCharFilter( Pattern pattern, String replacement, CharStream in ){ - this( pattern, replacement, DEFAULT_MAX_BLOCK_CHARS, null, in ); - } - - public PatternReplaceCharFilter( Pattern pattern, String replacement, - int maxBlockChars, CharStream in ){ - this( pattern, replacement, maxBlockChars, null, in ); - } + private Reader transformedInput; - public PatternReplaceCharFilter( Pattern pattern, String replacement, - String blockDelimiters, CharStream in ){ - this( pattern, replacement, DEFAULT_MAX_BLOCK_CHARS, blockDelimiters, in ); - } - - public PatternReplaceCharFilter( Pattern pattern, String replacement, - int maxBlockChars, String blockDelimiters, CharStream in ){ - super( in ); + public PatternReplaceCharFilter(Pattern pattern, String replacement, CharStream in) { + super(in); this.pattern = pattern; this.replacement = replacement; - if( maxBlockChars < 1 ) - throw new IllegalArgumentException( "maxBlockChars should be greater than 0, but it is " + maxBlockChars ); - this.maxBlockChars = maxBlockChars; - this.blockDelimiters = blockDelimiters; - blockBuffer = new char[maxBlockChars]; - } - - private boolean prepareReplaceBlock() throws IOException { - while( true ){ - if( replaceBlockBuffer != null && replaceBlockBuffer.length() > replaceBlockBufferOffset ) - return true; - // prepare block buffer - blockBufferLength = 0; - while( true ){ - int c = nextChar(); - if( c == -1 ) break; - blockBuffer[blockBufferLength++] = (char)c; - // end of block? - boolean foundDelimiter = - ( blockDelimiters != null ) && - ( blockDelimiters.length() > 0 ) && - blockDelimiters.indexOf( c ) >= 0; - if( foundDelimiter || - blockBufferLength >= maxBlockChars ) break; - } - // block buffer available? - if( blockBufferLength == 0 ) return false; - replaceBlockBuffer = getReplaceBlock( blockBuffer, 0, blockBufferLength ); - replaceBlockBufferOffset = 0; - } } - @Override - public int read() throws IOException { - while( prepareReplaceBlock() ){ - return replaceBlockBuffer.charAt( replaceBlockBufferOffset++ ); - } - return -1; + @Deprecated + public PatternReplaceCharFilter(Pattern pattern, String replacement, + int maxBlockChars, String blockDelimiter, CharStream in) { + this(pattern, replacement, in); } @Override public int read(char[] cbuf, int off, int len) throws IOException { - char[] tmp = new char[len]; - int l = input.read(tmp, 0, len); - if (l != -1) { - for(int i = 0; i < l; i++) - pushLastChar(tmp[i]); - } - l = 0; - for(int i = off; i < off + len; i++) { - int c = read(); - if (c == -1) break; - cbuf[i] = (char) c; - l++; + // Buffer all input on the first call. + if (transformedInput == null) { + StringBuilder buffered = new StringBuilder(); + char [] temp = new char [1024]; + for (int cnt = input.read(temp); cnt > 0; cnt = input.read(temp)) { + buffered.append(temp, 0, cnt); + } + transformedInput = new StringReader(processPattern(buffered).toString()); } - return l == 0 ? -1 : l; - } - private int nextChar() throws IOException { - if (buffer != null && !buffer.isEmpty()) { - nextCharCounter++; - return buffer.removeFirst().charValue(); - } - int c = input.read(); - if( c != -1 ) - nextCharCounter++; - return c; + return transformedInput.read(cbuf, off, len); } - private void pushLastChar(int c) { - if (buffer == null) { - buffer = new LinkedList(); - } - buffer.addLast(new Character((char) c)); - } - - String getReplaceBlock( String block ){ - char[] blockChars = block.toCharArray(); - return getReplaceBlock( blockChars, 0, blockChars.length ); + @Override + protected int correct(int currentOff) { + return Math.max(0, super.correct(currentOff)); } - - String getReplaceBlock( char block[], int offset, int length ){ - StringBuffer replaceBlock = new StringBuffer(); - String sourceBlock = new String( block, offset, length ); - Matcher m = pattern.matcher( sourceBlock ); - int lastMatchOffset = 0, lastDiff = 0; - while( m.find() ){ - m.appendReplacement( replaceBlock, replacement ); - // record cumulative diff for the offset correction - int diff = replaceBlock.length() - lastMatchOffset - lastDiff - ( m.end( 0 ) - lastMatchOffset ); - if (diff != 0) { - int prevCumulativeDiff = getLastCumulativeDiff(); - if (diff > 0) { - for(int i = 0; i < diff; i++){ - addOffCorrectMap(nextCharCounter - length + m.end( 0 ) + i - prevCumulativeDiff, - prevCumulativeDiff - 1 - i); - } + + /** + * Replace pattern in input and mark correction offsets. + */ + CharSequence processPattern(CharSequence input) { + final Matcher m = pattern.matcher(input); + + final StringBuffer cumulativeOutput = new StringBuffer(); + int cumulative = 0; + int lastMatchEnd = 0; + while (m.find()) { + final int groupSize = m.end() - m.start(); + final int skippedSize = m.start() - lastMatchEnd; + lastMatchEnd = m.end(); + + final int lengthBeforeReplacement = cumulativeOutput.length() + skippedSize; + m.appendReplacement(cumulativeOutput, replacement); + // Matcher doesn't tell us how many characters have been appended before the replacement. + // So we need to calculate it. Skipped characters have been added as part of appendReplacement. + final int replacementSize = cumulativeOutput.length() - lengthBeforeReplacement; + + if (groupSize != replacementSize) { + if (replacementSize < groupSize) { + // The replacement is smaller. + // Add the 'backskip' to the next index after the replacement (this is possibly + // after the end of string, but it's fine -- it just means the last character + // of the replaced block doesn't reach the end of the original string. + cumulative += groupSize - replacementSize; + int atIndex = lengthBeforeReplacement + replacementSize; + // System.err.println(atIndex + "!" + cumulative); + addOffCorrectMap(atIndex, cumulative); } else { - addOffCorrectMap(nextCharCounter - length + m.end( 0 ) + diff - prevCumulativeDiff, - prevCumulativeDiff - diff); + // The replacement is larger. Every new index needs to point to the last + // element of the original group (if any). + for (int i = groupSize; i < replacementSize; i++) { + addOffCorrectMap(lengthBeforeReplacement + i, --cumulative); + // System.err.println((lengthBeforeReplacement + i) + " " + cumulative); + } } } - // save last offsets - lastMatchOffset = m.end( 0 ); - lastDiff = diff; } - // copy remaining of the part of source block - m.appendTail( replaceBlock ); - return replaceBlock.toString(); + + // Append the remaining output, no further changes to indices. + m.appendTail(cumulativeOutput); + return cumulativeOutput; } } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java index f05c5aa..7a6a18c 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; @@ -29,12 +30,107 @@ import org.apache.lucene.analysis.CharStream; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.util._TestUtil; /** * Tests {@link PatternReplaceCharFilter} */ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase { - + public void testFailingDot() throws IOException { + checkOutput( + "A. .B.", "\\.[\\s]*", ".", + "A..B.", + "A..B."); + } + + public void testLongerReplacement() throws IOException { + checkOutput( + "XXabcZZabcYY", "abc", "abcde", + "XXabcdeZZabcdeYY", + "XXabcccZZabcccYY"); + checkOutput( + "XXabcabcYY", "abc", "abcde", + "XXabcdeabcdeYY", + "XXabcccabcccYY"); + checkOutput( + "abcabcYY", "abc", "abcde", + "abcdeabcdeYY", + "abcccabcccYY"); + checkOutput( + "YY", "^", "abcde", + "abcdeYY", + // Should be: "-----YY" but we're enforcing non-negative offsets. + "YYYYYYY"); + checkOutput( + "YY", "$", "abcde", + "YYabcde", + "YYYYYYY"); + checkOutput( + "XYZ", ".", "abc", + "abcabcabc", + "XXXYYYZZZ"); + checkOutput( + "XYZ", ".", "$0abc", + "XabcYabcZabc", + "XXXXYYYYZZZZ"); + } + + public void testShorterReplacement() throws IOException { + checkOutput( + "XXabcZZabcYY", "abc", "xy", + "XXxyZZxyYY", + "XXabZZabYY"); + checkOutput( + "XXabcabcYY", "abc", "xy", + "XXxyxyYY", + "XXababYY"); + checkOutput( + "abcabcYY", "abc", "xy", + "xyxyYY", + "ababYY"); + checkOutput( + "abcabcYY", "abc", "", + "YY", + "YY"); + checkOutput( + "YYabcabc", "abc", "", + "YY", + "YY"); + } + + private void checkOutput(String input, String pattern, String replacement, + String expectedOutput, String expectedIndexMatchedOutput) throws IOException { + CharStream cs = new PatternReplaceCharFilter(pattern(pattern), replacement, + CharReader.get(new StringReader(input))); + + StringBuilder output = new StringBuilder(); + for (int chr = cs.read(); chr > 0; chr = cs.read()) { + output.append((char) chr); + } + + StringBuilder indexMatched = new StringBuilder(); + for (int i = 0; i < output.length(); i++) { + indexMatched.append((cs.correctOffset(i) < 0 ? "-" : input.charAt(cs.correctOffset(i)))); + } + + boolean outputGood = expectedOutput.equals(output.toString()); + boolean indexMatchedGood = expectedIndexMatchedOutput.equals(indexMatched.toString()); + + if (!outputGood || !indexMatchedGood || false) { + System.out.println("Pattern : " + pattern); + System.out.println("Replac. : " + replacement); + System.out.println("Input : " + input); + System.out.println("Output : " + output); + System.out.println("Expected: " + expectedOutput); + System.out.println("Output/i: " + indexMatched); + System.out.println("Expected: " + expectedIndexMatchedOutput); + System.out.println(); + } + + assertTrue("Output doesn't match.", outputGood); + assertTrue("Index-matched output doesn't match.", indexMatchedGood); + } + // 1111 // 01234567890123 // this is test. @@ -142,9 +238,13 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase { // 012345678901234567890123456789012345678 // aa bb cc --- aa bb aa. bb aa bb cc // aa##bb cc --- aa##bb aa. bb aa##bb cc + + // aa bb cc --- aa bbbaa. bb aa b cc + public void test2blocksMultiMatches() throws IOException { final String BLOCK = " aa bb cc --- aa bb aa. bb aa bb cc"; - CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2", ".", + + CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2", CharReader.get( new StringReader( BLOCK ) ) ); TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); assertTokenStreamContents(ts, @@ -160,10 +260,10 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase { // aa b - c . --- b aa . c c b public void testChain() throws IOException { final String BLOCK = " a bb - ccc . --- bb a . ccc ccc bb"; - CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa", ".", + CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa", CharReader.get( new StringReader( BLOCK ) ) ); - cs = new PatternReplaceCharFilter( pattern("bb"), "b", ".", cs ); - cs = new PatternReplaceCharFilter( pattern("ccc"), "c", ".", cs ); + cs = new PatternReplaceCharFilter( pattern("bb"), "b", cs ); + cs = new PatternReplaceCharFilter( pattern("ccc"), "c", cs ); TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); assertTokenStreamContents(ts, new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" }, @@ -178,18 +278,33 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase { /** blast some random strings through the analyzer */ public void testRandomStrings() throws Exception { - Analyzer a = new Analyzer() { - @Override - protected TokenStreamComponents createComponents(String fieldName, Reader reader) { - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - return new TokenStreamComponents(tokenizer, tokenizer); - } + int numPatterns = atLeast(100); + for (int i = 0; i < numPatterns; i++) { + final Pattern p = randomPattern(); + final String replacement = _TestUtil.randomSimpleString(random); + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, tokenizer); + } - @Override - protected Reader initReader(Reader reader) { - return new PatternReplaceCharFilter(Pattern.compile("a"), "b", CharReader.get(reader)); + @Override + protected Reader initReader(Reader reader) { + return new PatternReplaceCharFilter(p, replacement, CharReader.get(reader)); + } + }; + checkRandomData(random, a, 1000*RANDOM_MULTIPLIER, true); // only ascii + } + } + + public static Pattern randomPattern() { + while (true) { + try { + return Pattern.compile(_TestUtil.randomRegexpishString(random)); + } catch (PatternSyntaxException ignored) { + // if at first you don't succeed... } - }; - checkRandomData(random, a, 10000*RANDOM_MULTIPLIER); + } } -} + } diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java index 882b133..d40296e 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java @@ -107,7 +107,7 @@ public class WFSTCompletionLookup extends Lookup { Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput); Sort.ByteSequencesReader reader = null; BytesRef scratch = new BytesRef(); - + boolean success = false; try { byte [] buffer = new byte [0];