diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java
index 77f5c95..344069a 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java
@@ -156,39 +156,55 @@ public class PatternReplaceCharFilter extends BaseCharFilter {
     }
     buffer.addLast(new Character((char) c));
   }
-
-  String getReplaceBlock( String block ){
-    char[] blockChars = block.toCharArray();
-    return getReplaceBlock( blockChars, 0, blockChars.length );
-  }
-
-  String getReplaceBlock( char block[], int offset, int length ){
-    StringBuffer replaceBlock = new StringBuffer();
-    String sourceBlock = new String( block, offset, length );
-    Matcher m = pattern.matcher( sourceBlock );
-    int lastMatchOffset = 0, lastDiff = 0;
-    while( m.find() ){
-      m.appendReplacement( replaceBlock, replacement );
-      // record cumulative diff for the offset correction
-      int diff = replaceBlock.length() - lastMatchOffset - lastDiff - ( m.end( 0 ) - lastMatchOffset );
-      if (diff != 0) {
-        int prevCumulativeDiff = getLastCumulativeDiff();
-        if (diff > 0) {
-          for(int i = 0; i < diff; i++){
-            addOffCorrectMap(nextCharCounter - length + m.end( 0 ) + i - prevCumulativeDiff,
-                prevCumulativeDiff - 1 - i);
-          }
+
+  /**
+   * Runs the pattern over <code>input</code>, substitutes the replacement for every
+   * match and, for each match whose replacement differs in length from the matched
+   * text, records an offset correction through <code>addOffCorrectMap</code>.
+   * Indexes passed to <code>addOffCorrectMap</code> are positions in the returned text.
+   *
+   * @param input the text to run the pattern over
+   * @return <code>input</code> with every match replaced
+   */
+  String getReplaceBlock(CharSequence input) {
+    final Matcher m = pattern.matcher(input);
+
+    final StringBuffer cumulativeOutput = new StringBuffer();
+    int cumulative = 0;
+    int lastMatchEnd = 0;
+    while (m.find()) {
+      final int groupSize = m.end() - m.start();
+      final int skippedSize = m.start() - lastMatchEnd;
+      lastMatchEnd = m.end();
+
+      final int lengthBeforeReplacement = cumulativeOutput.length() + skippedSize;
+      m.appendReplacement(cumulativeOutput, replacement);
+      // Matcher doesn't tell us how many characters have been appended before the replacement.
+      // So we need to calculate it. Skipped characters have been added as part of appendReplacement.
+      final int replacementSize = cumulativeOutput.length() - lengthBeforeReplacement;
+
+      if (groupSize != replacementSize) {
+        if (replacementSize < groupSize) {
+          // The replacement is smaller. Register the skip at the first index after the
+          // replacement, so the replacement's own characters keep mapping onto the
+          // start of the original group and only what follows is shifted.
+          cumulative += groupSize - replacementSize;
+          addOffCorrectMap(lengthBeforeReplacement + replacementSize, cumulative);
         } else {
-          addOffCorrectMap(nextCharCounter - length + m.end( 0 ) + diff - prevCumulativeDiff,
-              prevCumulativeDiff - diff);
+          // The replacement is larger. Every new index needs to point to the last
+          // element of the original group (if any).
+          for (int i = groupSize; i < replacementSize; i++) {
+            addOffCorrectMap(lengthBeforeReplacement + i, --cumulative);
+          }
         }
       }
-      // save last offsets
-      lastMatchOffset = m.end( 0 );
-      lastDiff = diff;
     }
-    // copy remaining of the part of source block
-    m.appendTail( replaceBlock );
-    return replaceBlock.toString();
+    // Append the remaining output, no further changes to indices.
+    m.appendTail(cumulativeOutput);
+    return cumulativeOutput.toString();
+  }
+
+  /** Convenience overload: replaces within <code>length</code> chars of <code>block</code> starting at <code>offset</code>. */
+  String getReplaceBlock(char block[], int offset, int length) {
+    return getReplaceBlock(new String(block, offset, length));
   }
 }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java
index f05c5aa..c50212f 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java
@@ -34,7 +34,47 @@ import org.apache.lucene.analysis.Tokenizer;
  * Tests {@link PatternReplaceCharFilter}
  */
 public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
+  /** A match that shrinks (". " becomes ".") must keep later offsets aligned with the input. */
+  public void testFailingDot() throws IOException {
+    final String BLOCK = "A. .B.";
+    Pattern DOTSPACE = Pattern.compile("\\.[\\s]*");
+
+    CharStream cs = CharReader.get(new StringReader(BLOCK));
+    cs = new PatternReplaceCharFilter(DOTSPACE, ".", cs);
+
+    StringBuilder processed = new StringBuilder();
+    // read() reports end of stream as -1; testing "> 0" would also stop at a NUL char.
+    for (int chr = cs.read(); chr != -1; chr = cs.read()) {
+      processed.append((char) chr);
+    }
+    assertEquals("A..B.", processed.toString());
+
+    // Output offset -> offset of the corresponding character in BLOCK.
+    final int[] expectedOffsets = { 0, 1, 3, 4, 5 };
+    for (int i = 0; i < expectedOffsets.length; i++) {
+      assertEquals("offset " + i, expectedOffsets[i], cs.correctOffset(i));
+    }
+  }
+
+  /** Mixes growing and shrinking replacements within a single input. */
+  public void testDotAtEnd() throws IOException {
+    final String BLOCK = " aa0bb1cc XX aa012bb345ccXaa012345bb6ccX";
+    CharStream cs = new PatternReplaceCharFilter( pattern("(aa)[0-9]+(bb)[0-9]+(cc)"), "$1--$2--$3",
+          CharReader.get( new StringReader( BLOCK ) ) );
+
+    StringBuilder processed = new StringBuilder();
+    for (int chr = cs.read(); chr != -1; chr = cs.read()) {
+      processed.append((char) chr);
+    }
+    assertEquals(" aa--bb--cc XX aa--bb--ccXaa--bb--ccX", processed.toString());
+
+    // Characters outside the matches must map back onto themselves in BLOCK.
+    final int[] unmatched = { 0, 11, 12, 13, 14, 25, 36 };
+    for (int i : unmatched) {
+      assertEquals("offset " + i, BLOCK.charAt(cs.correctOffset(i)), processed.charAt(i));
+    }
+  }
   
   //           1111
   //  01234567890123
   //  this is test.
diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java
index 882b133..d40296e 100644
--- a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java
+++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java
@@ -107,7 +107,7 @@ public class WFSTCompletionLookup extends Lookup {
     Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
     Sort.ByteSequencesReader reader = null;
     BytesRef scratch = new BytesRef();
-    
+
     boolean success = false;
     try {
       byte [] buffer = new byte [0];