Index: src/test/org/apache/lucene/analysis/TestToken.java =================================================================== --- src/test/org/apache/lucene/analysis/TestToken.java (revision 0) +++ src/test/org/apache/lucene/analysis/TestToken.java (revision 0) @@ -0,0 +1,56 @@ +package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.*; +import junit.framework.*; + +public class TestToken extends TestCase { + + public TestToken(String name) { + super(name); + } + + public void testToString() throws Exception { + char[] b = {'a', 'l', 'o', 'h', 'a'}; + Token t = new Token("", 0, 5); + t.setTermBuffer(b, 0, 5); + assertEquals("(aloha,0,5)", t.toString()); + + t.setTermText("hi there"); + assertEquals("(hi there,0,5)", t.toString()); + } + + public void testMixedStringArray() throws Exception { + Token t = new Token("hello", 0, 5); + assertEquals(t.termText(), "hello"); + assertEquals(t.termLength(), 5); + assertEquals(new String(t.termBuffer(), 0, 5), "hello"); + t.setTermText("hello2"); + assertEquals(t.termLength(), 6); + assertEquals(new String(t.termBuffer(), 0, 6), "hello2"); + t.setTermBuffer("hello3".toCharArray(), 0, 6); + assertEquals(t.termText(), "hello3"); + + // Make sure if we get the buffer and change a character + // that termText() reflects the change + char[] buffer = t.termBuffer(); + buffer[1] = 'o'; + assertEquals(t.termText(), "hollo3"); + } +} Property changes on: src/test/org/apache/lucene/analysis/TestToken.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java =================================================================== --- src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java (revision 560587) +++ src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java (working copy) @@ -94,7 +94,7 @@ Token token; while ((token = stream.next()) != null) { assertTrue(count < tokens.length); - assertEquals(tokens[count], token.termText); + assertEquals(tokens[count], token.termText()); count++; } Index: src/java/org/apache/lucene/analysis/SimpleAnalyzer.java =================================================================== --- src/java/org/apache/lucene/analysis/SimpleAnalyzer.java (revision 560587) +++ 
src/java/org/apache/lucene/analysis/SimpleAnalyzer.java (working copy) @@ -25,4 +25,14 @@ public TokenStream tokenStream(String fieldName, Reader reader) { return new LowerCaseTokenizer(reader); } + + public TokenStream reusableTokenStream(String fieldName, Reader reader) { + Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream(); + if (tokenizer == null) { + tokenizer = new LowerCaseTokenizer(reader); + setPreviousTokenStream(tokenizer); + } else + tokenizer.reset(reader); + return tokenizer; + } } Index: src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java =================================================================== --- src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java (revision 560587) +++ src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java (working copy) @@ -75,6 +75,14 @@ return analyzer.tokenStream(fieldName, reader); } + public TokenStream reusableTokenStream(String fieldName, Reader reader) { + Analyzer analyzer = (Analyzer) analyzerMap.get(fieldName); + if (analyzer == null) + analyzer = defaultAnalyzer; + + return analyzer.reusableTokenStream(fieldName, reader); + } + /** Return the positionIncrementGap from the analyzer assigned to fieldName */ public int getPositionIncrementGap(String fieldName) { Analyzer analyzer = (Analyzer) analyzerMap.get(fieldName); Index: src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java =================================================================== --- src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java (revision 560587) +++ src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java (working copy) @@ -25,4 +25,14 @@ public TokenStream tokenStream(String fieldName, Reader reader) { return new WhitespaceTokenizer(reader); } + + public TokenStream reusableTokenStream(String fieldName, Reader reader) { + Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream(); + if (tokenizer == null) { + tokenizer = new WhitespaceTokenizer(reader); + 
setPreviousTokenStream(tokenizer); + } else + tokenizer.reset(reader); + return tokenizer; + } } Index: src/java/org/apache/lucene/analysis/CharTokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/CharTokenizer.java (revision 560587) +++ src/java/org/apache/lucene/analysis/CharTokenizer.java (working copy) @@ -28,8 +28,7 @@ private int offset = 0, bufferIndex = 0, dataLen = 0; private static final int MAX_WORD_LEN = 255; - private static final int IO_BUFFER_SIZE = 1024; - private final char[] buffer = new char[MAX_WORD_LEN]; + private static final int IO_BUFFER_SIZE = 4096; private final char[] ioBuffer = new char[IO_BUFFER_SIZE]; /** Returns true iff a character should be included in a token. This @@ -45,31 +44,32 @@ return c; } - /** Returns the next token in the stream, or null at EOS. */ - public final Token next() throws IOException { + public final Token next(Token token) throws IOException { int length = 0; - int start = offset; + int start = bufferIndex; + char[] buffer = token.termBuffer(); while (true) { - final char c; - offset++; if (bufferIndex >= dataLen) { + offset += dataLen; dataLen = input.read(ioBuffer); + if (dataLen == -1) { + if (length > 0) + break; + else + return null; + } bufferIndex = 0; } - ; - if (dataLen == -1) { - if (length > 0) - break; - else - return null; - } else - c = ioBuffer[bufferIndex++]; + final char c = ioBuffer[bufferIndex++]; + if (isTokenChar(c)) { // if it's a token char if (length == 0) // start of token - start = offset - 1; + start = offset + bufferIndex - 1; + else if (length == buffer.length) + buffer = token.resizeTermBuffer(1+length); buffer[length++] = normalize(c); // buffer it, normalized @@ -78,9 +78,18 @@ } else if (length > 0) // at non-Letter w/ chars break; // return 'em - } - return new Token(new String(buffer, 0, length), start, start + length); + token.termLength = length; + token.startOffset = start; + token.endOffset = 
start+length; + return token; } + + public void reset(Reader input) { + super.reset(input); + bufferIndex = 0; + offset = 0; + dataLen = 0; + } } Index: src/java/org/apache/lucene/analysis/Tokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/Tokenizer.java (revision 560587) +++ src/java/org/apache/lucene/analysis/Tokenizer.java (working copy) @@ -23,6 +23,8 @@ /** A Tokenizer is a TokenStream whose input is a Reader.
This is an abstract class.
+ NOTE: subclasses must override at least one of {@link
+ #next()} or {@link #next(Token)}.
*/
public abstract class Tokenizer extends TokenStream {
@@ -41,5 +43,10 @@
public void close() throws IOException {
input.close();
}
+
+ /** Reset the tokenizer to a new reader. */
+ protected void reset(Reader input) {
+ this.input = input;
+ }
}
Index: src/java/org/apache/lucene/analysis/PorterStemFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/PorterStemFilter.java (revision 560587)
+++ src/java/org/apache/lucene/analysis/PorterStemFilter.java (working copy)
@@ -45,16 +45,13 @@
stemmer = new PorterStemmer();
}
- /** Returns the next input Token, after being stemmed */
- public final Token next() throws IOException {
- Token token = input.next();
- if (token == null)
+ public final Token next(Token result) throws IOException {
+ result = input.next(result);
+ if (result != null) {
+ if (stemmer.stem(result.termBuffer(), 0, result.termLength))
+ result.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
+ return result;
+ } else
return null;
- else {
- String s = stemmer.stem(token.termText);
- if (s != token.termText) // Yes, I mean object reference comparison here
- token.termText = s;
- return token;
- }
}
}
Index: src/java/org/apache/lucene/analysis/KeywordTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/KeywordTokenizer.java (revision 560587)
+++ src/java/org/apache/lucene/analysis/KeywordTokenizer.java (working copy)
@@ -28,7 +28,6 @@
private static final int DEFAULT_BUFFER_SIZE = 256;
private boolean done;
- private final char[] buffer;
public KeywordTokenizer(Reader input) {
this(input, DEFAULT_BUFFER_SIZE);
@@ -36,23 +35,23 @@
public KeywordTokenizer(Reader input, int bufferSize) {
super(input);
- this.buffer = new char[bufferSize];
this.done = false;
}
- public Token next() throws IOException {
+ public Token next(Token result) throws IOException {
if (!done) {
done = true;
- StringBuffer buffer = new StringBuffer();
- int length;
+ int upto = 0;
+ char[] buffer = result.termBuffer();
while (true) {
- length = input.read(this.buffer);
+ final int length = input.read(buffer, upto, buffer.length-upto);
if (length == -1) break;
-
- buffer.append(this.buffer, 0, length);
+ upto += length;
+ if (upto == buffer.length)
+ buffer = result.resizeTermBuffer();
}
- String text = buffer.toString();
- return new Token(text, 0, text.length());
+ result.termLength = upto;
+ return result;
}
return null;
}
Index: src/java/org/apache/lucene/analysis/Token.java
===================================================================
--- src/java/org/apache/lucene/analysis/Token.java (revision 560587)
+++ src/java/org/apache/lucene/analysis/Token.java (working copy)
@@ -1,8 +1,5 @@
package org.apache.lucene.analysis;
-import org.apache.lucene.index.Payload;
-import org.apache.lucene.index.TermPositions;
-
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -45,65 +42,106 @@
supported anymore in such a case.
@see org.apache.lucene.index.Payload
- */
- // TODO: Remove warning after API has been finalized
+
+
+
+
NOTE: As of 2.3, the Token stores the term text + internally as a char[] termBuffer instead of String + termText. This provides better indexing speed as + tokenizers can directly fill in this buffer and filters + can directly modify this buffer, instead of creating a new + String each time. The APIs that accept String termText + are still available but a warning about performance has + been added. The method that gets the term text as a + String has been deprecated.
+ +To create a Token you should first use one of the + constructors that starts with null text. Then you should + call either {@link #termBuffer()} or {@link + #resizeTermBuffer()} to retrieve the termBuffer. Fill in + the characters of your term into this buffer, and call + {@link #setTermLength()} to set the length of the text. + See LUCENE-969 + for details.
+*/ + +import org.apache.lucene.index.Payload; +import org.apache.lucene.index.TermPositions; + +// TODO: Remove warning after API has been finalized public class Token implements Cloneable { - String termText; // the text of the term + + private static final String DEFAULT_TYPE = "word"; + private static int MIN_BUFFER_SIZE = 10; + + /** @deprecated: we will remove this when we remove the + * deprecated APIs */ + private String termText; + + char[] termBuffer; // characters for the term text + int termLength; // length of term text in buffer + int startOffset; // start in source text int endOffset; // end in source text - String type = "word"; // lexical type + String type = DEFAULT_TYPE; // lexical type Payload payload; - // For better indexing speed, use termBuffer (and - // termBufferOffset/termBufferLength) instead of termText - // to save new'ing a String per token - char[] termBuffer; - int termBufferOffset; - int termBufferLength; + int positionIncrement = 1; - private int positionIncrement = 1; + /** Constructs a Token will null text. */ + public Token() { + } - /** Constructs a Token with the given term text, and start & end offsets. - The type defaults to "word." */ - public Token(String text, int start, int end) { - termText = text; + /** Constructs a Token with null text and start & end + * offsets. + * @param start start offset + * @param end end offset */ + public Token(int start, int end) { startOffset = start; endOffset = end; } - /** Constructs a Token with the given term text buffer - * starting at offset for length lenth, and start & end offsets. - * The type defaults to "word." */ - public Token(char[] text, int offset, int length, int start, int end) { - termBuffer = text; - termBufferOffset = offset; - termBufferLength = length; + /** Constructs a Token with null text and start & end + * offsets plus the Token type. 
+ * @param start start offset + * @param end end offset */ + public Token(int start, int end, String typ) { startOffset = start; endOffset = end; + type = typ; } - /** Constructs a Token with the given text, start and end offsets, & type. */ - public Token(String text, int start, int end, String typ) { + /** Constructs a Token with the given term text, and start + * & end offsets. The type defaults to "word." + * NOTE: for better indexing speed you should + * instead use the char[] termBuffer methods to set the + * term text. + * @param text term text + * @param start start offset + * @param end end offset */ + public Token(String text, int start, int end) { termText = text; startOffset = start; endOffset = end; - type = typ; } - /** Constructs a Token with the given term text buffer - * starting at offset for length lenth, and start & end - * offsets, & type. */ - public Token(char[] text, int offset, int length, int start, int end, String typ) { - termBuffer = text; - termBufferOffset = offset; - termBufferLength = length; + /** Constructs a Token with the given text, start and end + * offsets, & type. NOTE: for better indexing + * speed you should instead use the char[] termBuffer + * methods to set the term text. + * @param text term text + * @param start start offset + * @param end end offset + * @param token type */ + public Token(String text, int start, int end, String typ) { + termText = text; startOffset = start; endOffset = end; type = typ; } - /** Set the position increment. This determines the position of this token * relative to the previous Token in a {@link TokenStream}, used in phrase * searching. @@ -139,71 +177,170 @@ /** Returns the position increment of this Token. * @see #setPositionIncrement */ - public int getPositionIncrement() { return positionIncrement; } + public int getPositionIncrement() { + return positionIncrement; + } - /** Sets the Token's term text. */ + /** Sets the Token's term text. 
NOTE: for better + * indexing speed you should instead use the char[] + * termBuffer methods to set the term text. */ public void setTermText(String text) { termText = text; + termBuffer = null; } - /** Returns the Token's term text. */ - public final String termText() { return termText; } - public final char[] termBuffer() { return termBuffer; } - public final int termBufferOffset() { return termBufferOffset; } - public final int termBufferLength() { return termBufferLength; } + /** Returns the Token's term text. @deprecated Use + * {@link #termBuffer()} and {@link #termLength()} + * instead. */ + public final String termText() { + if (termText == null && termBuffer != null) + termText = new String(termBuffer, 0, termLength); + return termText; + } - public void setStartOffset(int offset) {this.startOffset = offset;} - public void setEndOffset(int offset) {this.endOffset = offset;} - + /** Copies the contents of buffer, starting at offset for + * length characters, into the termBuffer array. */ public final void setTermBuffer(char[] buffer, int offset, int length) { - this.termBuffer = buffer; - this.termBufferOffset = offset; - this.termBufferLength = length; + resizeTermBuffer(length); + System.arraycopy(buffer, offset, termBuffer, 0, length); + termLength = length; } - + /** Returns the internal termBuffer character array which + * you can then directly alter. If the array is too + * small for your token, use {@link resizeTermBuffer()} + * to increase it. */ + public final char[] termBuffer() { + initTermBuffer(); + return termBuffer; + } + + /** Grows the termBuffer to at least size newSize. 
+ * @param newSize minimum size of the new termBuffer + * @returns newly created termBuffer with length >= newSize + */ + public char[] resizeTermBuffer(int newSize) { + initTermBuffer(); + if (newSize > termBuffer.length) { + int size = termBuffer.length; + while(size < newSize) + size *= 2; + char[] newBuffer = new char[size]; + System.arraycopy(termBuffer, 0, newBuffer, 0, termBuffer.length); + termBuffer = newBuffer; + } + return termBuffer; + } + + /** Increase the size of the term buffer by the default + * growth factor (2X). @see #resizeTermBuffer() */ + public char[] resizeTermBuffer() { + return resizeTermBuffer(1+termBuffer.length); + } + + // TODO: once we remove the deprecated termText() method + // and switch entirely to char[] termBuffer we don't need + // to use this method anymore + private void initTermBuffer() { + if (termBuffer == null) { + if (termText == null) { + termBuffer = new char[MIN_BUFFER_SIZE]; + termLength = 0; + } else { + int length = termText.length(); + if (length < MIN_BUFFER_SIZE) length = MIN_BUFFER_SIZE; + termBuffer = new char[length]; + termLength = termText.length(); + termText.getChars(0, termText.length(), termBuffer, 0); + termText = null; + } + } else if (termText != null) + termText = null; + } + + /** Return number of valid characters (length of the term) + * in the termBuffer array. */ + public final int termLength() { + initTermBuffer(); + return termLength; + } + + /** Set number of valid characters (length of the term) in + * the termBuffer array. */ + public final void setTermLength(int length) { + initTermBuffer(); + termLength = length; + } + /** Returns this Token's starting offset, the position of the first character corresponding to this token in the source text. Note that the difference between endOffset() and startOffset() may not be equal to termText.length(), as the term text may have been altered by a stemmer or some other filter. 
*/ - public final int startOffset() { return startOffset; } + public final int startOffset() { + return startOffset; + } + /** Set the starting offset. @see #startOffset() */ + public void setStartOffset(int offset) { + this.startOffset = offset; + } + /** Returns this Token's ending offset, one greater than the position of the last character corresponding to this token in the source text. */ - public final int endOffset() { return endOffset; } + public final int endOffset() { + return endOffset; + } + /** Set the ending offset. @see #endOffset() */ + public void setEndOffset(int offset) { + this.endOffset = offset; + } + /** Returns this Token's lexical type. Defaults to "word". */ - public final String type() { return type; } + public final String type() { + return type; + } + /** Set the lexical type. @see #type() */ + public final void setType(String type) { + this.type = type; + } + /** - * Sets this Token's payload. + * Returns this Token's payload. ** WARNING: The status of the Payloads feature is experimental. * The APIs introduced here might change in the future and will not be * supported anymore in such a case. */ // TODO: Remove warning after API has been finalized - public void setPayload(Payload payload) { - this.payload = payload; + public Payload getPayload() { + return this.payload; } - + /** - * Returns this Token's payload. + * Sets this Token's payload. *
* WARNING: The status of the Payloads feature is experimental. * The APIs introduced here might change in the future and will not be * supported anymore in such a case. */ // TODO: Remove warning after API has been finalized - public Payload getPayload() { - return this.payload; + public void setPayload(Payload payload) { + this.payload = payload; } - + public String toString() { StringBuffer sb = new StringBuffer(); - sb.append("(" + termText + "," + startOffset + "," + endOffset); + sb.append("("); + initTermBuffer(); + if (termBuffer == null) + sb.append("null"); + else + sb.append(termBuffer, 0, termLength); + sb.append("," + startOffset + "," + endOffset); if (!type.equals("word")) sb.append(",type="+type); if (positionIncrement != 1) @@ -212,11 +349,15 @@ return sb.toString(); } - public Object clone() { - try { - return super.clone(); - } catch (CloneNotSupportedException e) { - throw new RuntimeException(e); // shouldn't happen since we implement Cloneable - } + /** Reset all state for this token back to defaults (same + * state that new Token() creates). */ + public void clear() { + payload = null; + termBuffer = null; + termLength = 0; + termText = null; + positionIncrement = 1; + startOffset = endOffset = 0; + type = DEFAULT_TYPE; } } Index: src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java =================================================================== --- src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java (revision 560587) +++ src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java (working copy) @@ -69,10 +69,33 @@ /** Constructs a {@link StandardTokenizer} filtered by a {@link StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. 
*/ public TokenStream tokenStream(String fieldName, Reader reader) { - TokenStream result = new StandardTokenizer(reader); - result = new StandardFilter(result); - result = new LowerCaseFilter(result); - result = new StopFilter(result, stopSet); - return result; + TokenStream tokenStream = new StandardTokenizer(reader); + tokenStream = new StandardFilter(tokenStream); + tokenStream = new LowerCaseFilter(tokenStream); + tokenStream = new StopFilter(tokenStream, stopSet); + return tokenStream; } + + private class SavedStreams { + StandardTokenizer tokenStream; + TokenStream filteredTokenStream; + FastCharStream charStream; + }; + public TokenStream reusableTokenStream(String fieldName, Reader reader) { + SavedStreams streams = (SavedStreams) getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + streams.charStream = new FastCharStream(reader); + streams.tokenStream = new StandardTokenizer(streams.charStream); + streams.tokenStream.setInput(reader); + streams.filteredTokenStream = new StandardFilter(streams.tokenStream); + streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream); + streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet); + } else { + streams.charStream.reset(reader); + streams.tokenStream.ReInit(streams.charStream); + } + + return streams.filteredTokenStream; + } } Index: src/java/org/apache/lucene/analysis/standard/StandardFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/standard/StandardFilter.java (revision 560587) +++ src/java/org/apache/lucene/analysis/standard/StandardFilter.java (working copy) @@ -37,33 +37,33 @@ *
Removes 's from the end of words. *
Removes dots from acronyms.
*/
- public final org.apache.lucene.analysis.Token next() throws java.io.IOException {
- org.apache.lucene.analysis.Token t = input.next();
+ public final org.apache.lucene.analysis.Token next(org.apache.lucene.analysis.Token result) throws java.io.IOException {
+ org.apache.lucene.analysis.Token t = input.next(result);
if (t == null)
return null;
- String text = t.termText();
- String type = t.type();
+ char[] buffer = t.termBuffer();
+ final int bufferLength = t.termLength();
+ final int bufferEnd = bufferLength;
+ final String type = t.type();
if (type == APOSTROPHE_TYPE && // remove 's
- (text.endsWith("'s") || text.endsWith("'S"))) {
- return new org.apache.lucene.analysis.Token
- (text.substring(0,text.length()-2),
- t.startOffset(), t.endOffset(), type);
-
+ bufferLength >= 2 &&
+ buffer[bufferEnd-2] == '\'' &&
+ (buffer[bufferEnd-1] == 's' || buffer[bufferEnd-1] == 'S')) {
+ // Strip last 2 characters off
+ t.setTermLength(bufferLength - 2);
} else if (type == ACRONYM_TYPE) { // remove dots
- StringBuffer trimmed = new StringBuffer();
- for (int i = 0; i < text.length(); i++) {
- char c = text.charAt(i);
- if (c != '.')
- trimmed.append(c);
+ int upto = 0;
+      for(int i=0;i<bufferLength;i++) {
+        char c = buffer[i];
+        if (c != '.')
+          buffer[upto++] = c;
       }
-      return new org.apache.lucene.analysis.Token
-          (trimmed.toString(),
-           t.startOffset(), t.endOffset(), type);
+      t.setTermLength(upto);
     }
     return t;
   }
 }
Index: src/java/org/apache/lucene/analysis/TokenFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/TokenFilter.java (revision 560587)
+++ src/java/org/apache/lucene/analysis/TokenFilter.java (working copy)
@@ -23,6 +23,8 @@
 /** A TokenFilter is a TokenStream whose input is another token stream.
   <p>
This is an abstract class.
+ NOTE: subclasses must override at least one of {@link
+ #next()} or {@link #next(Token)}.
*/
public abstract class TokenFilter extends TokenStream {
/** The source of tokens for this filter. */
Index: src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java (revision 560587)
+++ src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java (working copy)
@@ -25,144 +25,165 @@
*
*/
public class ISOLatin1AccentFilter extends TokenFilter {
- public ISOLatin1AccentFilter(TokenStream input) {
- super(input);
- }
+ public ISOLatin1AccentFilter(TokenStream input) {
+ super(input);
+ }
- public final Token next() throws java.io.IOException {
- final Token t = input.next();
- if (t != null)
- t.setTermText(removeAccents(t.termText()));
- return t;
- }
+ private char[] output = new char[256];
+ private int outputPos;
- /**
- * To replace accented characters in a String by unaccented equivalents.
- */
- public final static String removeAccents(String input) {
- final StringBuffer output = new StringBuffer();
- for (int i = 0; i < input.length(); i++) {
- switch (input.charAt(i)) {
- case '\u00C0' : // À
- case '\u00C1' : // Á
- case '\u00C2' : // Â
- case '\u00C3' : // Ã
- case '\u00C4' : // Ä
- case '\u00C5' : // Å
- output.append("A");
- break;
- case '\u00C6' : // Æ
- output.append("AE");
- break;
- case '\u00C7' : // Ç
- output.append("C");
- break;
- case '\u00C8' : // È
- case '\u00C9' : // É
- case '\u00CA' : // Ê
- case '\u00CB' : // Ë
- output.append("E");
- break;
- case '\u00CC' : // Ì
- case '\u00CD' : // Í
- case '\u00CE' : // Î
- case '\u00CF' : // Ï
- output.append("I");
- break;
- case '\u00D0' : // Ð
- output.append("D");
- break;
- case '\u00D1' : // Ñ
- output.append("N");
- break;
- case '\u00D2' : // Ò
- case '\u00D3' : // Ó
- case '\u00D4' : // Ô
- case '\u00D5' : // Õ
- case '\u00D6' : // Ö
- case '\u00D8' : // Ø
- output.append("O");
- break;
- case '\u0152' : // Œ
- output.append("OE");
- break;
- case '\u00DE' : // Þ
- output.append("TH");
- break;
- case '\u00D9' : // Ù
- case '\u00DA' : // Ú
- case '\u00DB' : // Û
- case '\u00DC' : // Ü
- output.append("U");
- break;
- case '\u00DD' : // Ý
- case '\u0178' : // Ÿ
- output.append("Y");
- break;
- case '\u00E0' : // à
- case '\u00E1' : // á
- case '\u00E2' : // â
- case '\u00E3' : // ã
- case '\u00E4' : // ä
- case '\u00E5' : // å
- output.append("a");
- break;
- case '\u00E6' : // æ
- output.append("ae");
- break;
- case '\u00E7' : // ç
- output.append("c");
- break;
- case '\u00E8' : // è
- case '\u00E9' : // é
- case '\u00EA' : // ê
- case '\u00EB' : // ë
- output.append("e");
- break;
- case '\u00EC' : // ì
- case '\u00ED' : // í
- case '\u00EE' : // î
- case '\u00EF' : // ï
- output.append("i");
- break;
- case '\u00F0' : // ð
- output.append("d");
- break;
- case '\u00F1' : // ñ
- output.append("n");
- break;
- case '\u00F2' : // ò
- case '\u00F3' : // ó
- case '\u00F4' : // ô
- case '\u00F5' : // õ
- case '\u00F6' : // ö
- case '\u00F8' : // ø
- output.append("o");
- break;
- case '\u0153' : // œ
- output.append("oe");
- break;
- case '\u00DF' : // ß
- output.append("ss");
- break;
- case '\u00FE' : // þ
- output.append("th");
- break;
- case '\u00F9' : // ù
- case '\u00FA' : // ú
- case '\u00FB' : // û
- case '\u00FC' : // ü
- output.append("u");
- break;
- case '\u00FD' : // ý
- case '\u00FF' : // ÿ
- output.append("y");
- break;
- default :
- output.append(input.charAt(i));
- break;
- }
- }
- return output.toString();
- }
-}
\ No newline at end of file
+ public final Token next(Token result) throws java.io.IOException {
+ result = input.next(result);
+ if (result != null) {
+ outputPos = 0;
+ removeAccents(result.termBuffer(), result.termLength());
+ result.setTermBuffer(output, 0, outputPos);
+ return result;
+ } else
+ return null;
+ }
+
+ private final void addChar(char c) {
+ if (outputPos == output.length) {
+ char[] newArray = new char[2*output.length];
+      System.arraycopy(output, 0, newArray, 0, output.length); output = newArray;
+ }
+ output[outputPos++] = c;
+ }
+
+ /**
+ * To replace accented characters in a String by unaccented equivalents.
+ */
+ public final void removeAccents(char[] input, int length) {
+ int pos = 0;
+ for (int i=0; i