Index: CHANGES.txt =================================================================== --- CHANGES.txt (revision 566869) +++ CHANGES.txt (working copy) @@ -119,6 +119,9 @@ TokenStream instances when possible to improve tokenization performance (~10-15%). (Mike McCandless) + 9. LUCENE-871: Speedup ISOLatin1AccentFilter (Ian Boston via Mike + McCandless) + Documentation Build Index: src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java (revision 566869) +++ src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java (working copy) @@ -35,155 +35,177 @@ public final Token next(Token result) throws java.io.IOException { result = input.next(result); if (result != null) { - outputPos = 0; - removeAccents(result.termBuffer(), result.termLength()); - result.setTermBuffer(output, 0, outputPos); + final char[] buffer = result.termBuffer(); + final int length = result.termLength(); + // If no characters actually require rewriting then we + // just return token as-is: + for(int i=0;i<length;i++) { + final char c = buffer[i]; + if (c >= '\u00c0' && c <= '\u0178') { + removeAccents(buffer, length); + result.setTermBuffer(output, 0, outputPos); + break; + } + } return result; } else return null; } - private final void addChar(char c) { - if (outputPos == output.length) { - char[] newArray = new char[2*output.length]; - System.arraycopy(output, 0, newArray, 0, output.length); - output = newArray; - } - output[outputPos++] = c; - } - /** * To replace accented characters in a String by unaccented equivalents. */ public final void removeAccents(char[] input, int length) { + + // Worst-case length required: + final int maxSizeNeeded = 2*length; + + int size = output.length; + while (size < maxSizeNeeded) + size *= 2; + + if (size != output.length) + output = new char[size]; + + outputPos = 0; + int pos = 0; + for (int i=0; i