Index: src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java (revision 831699) +++ src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java (working copy) @@ -57,8 +57,7 @@ * 'a'. */ public final class ASCIIFoldingFilter extends TokenFilter { - public ASCIIFoldingFilter(TokenStream input) - { + public ASCIIFoldingFilter(TokenStream input) { super(input); termAtt = addAttribute(TermAttribute.class); } @@ -67,6 +66,7 @@ private int outputPos; private TermAttribute termAtt; + /** {@inheritDoc} */ @Override public boolean incrementToken() throws IOException { if (input.incrementToken()) { @@ -77,8 +77,7 @@ // just return token as-is: for(int i = 0 ; i < length ; ++i) { final char c = buffer[i]; - if (c >= '\u0080') - { + if (c >= '\u0080') { foldToASCII(buffer, length); termAtt.setTermBuffer(output, 0, outputPos); break; @@ -96,19 +95,59 @@ * @param input The string to fold * @param length The number of characters in the input string */ - public void foldToASCII(char[] input, int length) - { + protected void foldToASCII(char[] input, int length) { // Worst-case length required: final int maxSizeNeeded = 4 * length; if (output.length < maxSizeNeeded) { output = new char[ArrayUtil.getNextSize(maxSizeNeeded)]; } - outputPos = 0; - - for (int pos = 0 ; pos < length ; ++pos) { + outputPos = foldToASCII(input, output, 0); + } + + /** + * Converts characters above ASCII to their ASCII equivalents. For example, + * accents are removed from accented characters. + * @param input The characters to fold + * @param output The result of the folding. Should be of size >= {@code 4*input.length + outputPos}. 
+ * @return new position ({@code outputPos}) in {@code output} + */ + public static final int foldToASCII(char[] input, char[] output, int outputPos) { + assert output.length >= 4*input.length + outputPos; + for (int pos = 0 ; pos < input.length ; ++pos) { final char c = input[pos]; + outputPos = foldToASCII(c, output, outputPos); + } + return outputPos; + } + /** + * Converts characters above ASCII to their ASCII equivalents. For example, + * accents are removed from accented characters. + * @param input The CharSequence to fold + * @return The result of the folding + */ + public static String foldToASCII(CharSequence input) { + int length = input.length(); + // Worst-case length required: + final int maxSizeNeeded = 4 * length; + char[] output = new char[maxSizeNeeded]; + int outputPos = 0; + for (int pos = 0 ; pos < length ; ++pos) { + final char c = input.charAt(pos); + outputPos = foldToASCII(c, output, outputPos); + } + return new String(output, 0, outputPos); + } + /** + * Converts characters above ASCII to their ASCII equivalents. For example, + * accents are removed from accented characters. + * @param c The character to fold + * @param output The result of the folding. Should be of size >= {@code outputPos + 4}. + * @param outputPos Where to put the folding result in {@code output} + * @return new position ({@code outputPos}) in {@code output} + */ + public static final int foldToASCII(char c, char[] output, int outputPos) { // Quick test: if it's not in range then just keep current character if (c < '\u0080') { output[outputPos++] = c; @@ -2025,7 +2064,7 @@ output[outputPos++] = c; break; } - } } + return outputPos; } }