Index: CHANGES.txt =================================================================== --- CHANGES.txt (revision 561926) +++ CHANGES.txt (working copy) @@ -18,10 +18,15 @@ 2. LUCENE-944: Remove deprecated methods setUseScorer14() and getUseScorer14() from BooleanQuery. (Paul Elschot via Michael Busch) - 5. LUCENE-963: Add setters to Field to allow for re-using a single + 3. LUCENE-963: Add setters to Field to allow for re-using a single Field instance during indexing. This is a sizable performance gain, especially for small documents. (Mike McCandless) + 4. LUCENE-969: Add new APIs to Token, TokenStream and Analyzer to + permit re-using of Token and TokenStream instances during + indexing. This gives faster indexing performance. (Mike + McCandless) + Bug fixes 1. LUCENE-933: QueryParser fixed to not produce empty sub @@ -97,6 +102,10 @@ 6. LUCENE-939: Check explicitly for boundary conditions in FieldInfos and don't rely on exceptions. (Michael Busch) + 7. LUCENE-969: Changed core tokenizers & filters to re-use Token and + TokenStream instances when possible to improve indexing + performance. (Mike McCandless) + Documentation Build Index: src/test/org/apache/lucene/analysis/TestToken.java =================================================================== --- src/test/org/apache/lucene/analysis/TestToken.java (revision 0) +++ src/test/org/apache/lucene/analysis/TestToken.java (revision 0) @@ -0,0 +1,56 @@ +package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.*; +import junit.framework.*; + +public class TestToken extends TestCase { + + public TestToken(String name) { + super(name); + } + + public void testToString() throws Exception { + char[] b = {'a', 'l', 'o', 'h', 'a'}; + Token t = new Token("", 0, 5); + t.setTermBuffer(b, 0, 5); + assertEquals("(aloha,0,5)", t.toString()); + + t.setTermText("hi there"); + assertEquals("(hi there,0,5)", t.toString()); + } + + public void testMixedStringArray() throws Exception { + Token t = new Token("hello", 0, 5); + assertEquals(t.termText(), "hello"); + assertEquals(t.termLength(), 5); + assertEquals(new String(t.termBuffer(), 0, 5), "hello"); + t.setTermText("hello2"); + assertEquals(t.termLength(), 6); + assertEquals(new String(t.termBuffer(), 0, 6), "hello2"); + t.setTermBuffer("hello3".toCharArray(), 0, 6); + assertEquals(t.termText(), "hello3"); + + // Make sure if we get the buffer and change a character + // that termText() reflects the change + char[] buffer = t.termBuffer(); + buffer[1] = 'o'; + assertEquals(t.termText(), "hollo3"); + } +} Property changes on: src/test/org/apache/lucene/analysis/TestToken.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java =================================================================== --- src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java (revision 561926) +++ src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java (working copy) @@ -94,7 +94,7 @@ Token 
token; while ((token = stream.next()) != null) { assertTrue(count < tokens.length); - assertEquals(tokens[count], token.termText); + assertEquals(tokens[count], token.termText()); count++; } Index: src/java/org/apache/lucene/analysis/SimpleAnalyzer.java =================================================================== --- src/java/org/apache/lucene/analysis/SimpleAnalyzer.java (revision 561926) +++ src/java/org/apache/lucene/analysis/SimpleAnalyzer.java (working copy) @@ -18,6 +18,7 @@ */ import java.io.Reader; +import java.io.IOException; /** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */ @@ -25,4 +26,14 @@ public TokenStream tokenStream(String fieldName, Reader reader) { return new LowerCaseTokenizer(reader); } + + public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { + Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream(); + if (tokenizer == null) { + tokenizer = new LowerCaseTokenizer(reader); + setPreviousTokenStream(tokenizer); + } else + tokenizer.reset(reader); + return tokenizer; + } } Index: src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java =================================================================== --- src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java (revision 561926) +++ src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java (working copy) @@ -75,6 +75,14 @@ return analyzer.tokenStream(fieldName, reader); } + public TokenStream reusableTokenStream(String fieldName, Reader reader) { + Analyzer analyzer = (Analyzer) analyzerMap.get(fieldName); + if (analyzer == null) + analyzer = defaultAnalyzer; + + return analyzer.reusableTokenStream(fieldName, reader); + } + /** Return the positionIncrementGap from the analyzer assigned to fieldName */ public int getPositionIncrementGap(String fieldName) { Analyzer analyzer = (Analyzer) analyzerMap.get(fieldName); Index: src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java 
=================================================================== --- src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java (revision 561926) +++ src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java (working copy) @@ -18,6 +18,7 @@ */ import java.io.Reader; +import java.io.IOException; /** An Analyzer that uses WhitespaceTokenizer. */ @@ -25,4 +26,14 @@ public TokenStream tokenStream(String fieldName, Reader reader) { return new WhitespaceTokenizer(reader); } + + public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { + Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream(); + if (tokenizer == null) { + tokenizer = new WhitespaceTokenizer(reader); + setPreviousTokenStream(tokenizer); + } else + tokenizer.reset(reader); + return tokenizer; + } } Index: src/java/org/apache/lucene/analysis/CharTokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/CharTokenizer.java (revision 561926) +++ src/java/org/apache/lucene/analysis/CharTokenizer.java (working copy) @@ -28,8 +28,7 @@ private int offset = 0, bufferIndex = 0, dataLen = 0; private static final int MAX_WORD_LEN = 255; - private static final int IO_BUFFER_SIZE = 1024; - private final char[] buffer = new char[MAX_WORD_LEN]; + private static final int IO_BUFFER_SIZE = 4096; private final char[] ioBuffer = new char[IO_BUFFER_SIZE]; /** Returns true iff a character should be included in a token. This @@ -45,31 +44,32 @@ return c; } - /** Returns the next token in the stream, or null at EOS. 
*/ - public final Token next() throws IOException { + public final Token next(Token token) throws IOException { int length = 0; - int start = offset; + int start = bufferIndex; + char[] buffer = token.termBuffer(); while (true) { - final char c; - offset++; if (bufferIndex >= dataLen) { + offset += dataLen; dataLen = input.read(ioBuffer); + if (dataLen == -1) { + if (length > 0) + break; + else + return null; + } bufferIndex = 0; } - ; - if (dataLen == -1) { - if (length > 0) - break; - else - return null; - } else - c = ioBuffer[bufferIndex++]; + final char c = ioBuffer[bufferIndex++]; + if (isTokenChar(c)) { // if it's a token char if (length == 0) // start of token - start = offset - 1; + start = offset + bufferIndex - 1; + else if (length == buffer.length) + buffer = token.resizeTermBuffer(1+length); buffer[length++] = normalize(c); // buffer it, normalized @@ -78,9 +78,18 @@ } else if (length > 0) // at non-Letter w/ chars break; // return 'em - } - return new Token(new String(buffer, 0, length), start, start + length); + token.termLength = length; + token.startOffset = start; + token.endOffset = start+length; + return token; } + + public void reset(Reader input) throws IOException { + super.reset(input); + bufferIndex = 0; + offset = 0; + dataLen = 0; + } } Index: src/java/org/apache/lucene/analysis/Tokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/Tokenizer.java (revision 561926) +++ src/java/org/apache/lucene/analysis/Tokenizer.java (working copy) @@ -23,6 +23,8 @@ /** A Tokenizer is a TokenStream whose input is a Reader.
This is an abstract class.
+ NOTE: subclasses must override at least one of {@link
+ #next()} or {@link #next(Token)}.
*/
public abstract class Tokenizer extends TokenStream {
@@ -41,5 +43,13 @@
public void close() throws IOException {
input.close();
}
+
+ /** Reset the tokenizer to a new reader. An analyzer may
+ * use this method to re-use a single TokenStream rather
+ * than creating a new TokenStream for every field of
+ * every document. */
+ protected void reset(Reader input) throws IOException {
+ this.input = input;
+ }
}
Index: src/java/org/apache/lucene/analysis/PorterStemFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/PorterStemFilter.java (revision 561926)
+++ src/java/org/apache/lucene/analysis/PorterStemFilter.java (working copy)
@@ -45,16 +45,13 @@
stemmer = new PorterStemmer();
}
- /** Returns the next input Token, after being stemmed */
- public final Token next() throws IOException {
- Token token = input.next();
- if (token == null)
+ public final Token next(Token result) throws IOException {
+ result = input.next(result);
+ if (result != null) {
+ if (stemmer.stem(result.termBuffer(), 0, result.termLength))
+ result.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
+ return result;
+ } else
return null;
- else {
- String s = stemmer.stem(token.termText);
- if (s != token.termText) // Yes, I mean object reference comparison here
- token.termText = s;
- return token;
- }
}
}
Index: src/java/org/apache/lucene/analysis/KeywordTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/KeywordTokenizer.java (revision 561926)
+++ src/java/org/apache/lucene/analysis/KeywordTokenizer.java (working copy)
@@ -28,7 +28,6 @@
private static final int DEFAULT_BUFFER_SIZE = 256;
private boolean done;
- private final char[] buffer;
public KeywordTokenizer(Reader input) {
this(input, DEFAULT_BUFFER_SIZE);
@@ -36,23 +35,23 @@
public KeywordTokenizer(Reader input, int bufferSize) {
super(input);
- this.buffer = new char[bufferSize];
this.done = false;
}
- public Token next() throws IOException {
+ public Token next(Token result) throws IOException {
if (!done) {
done = true;
- StringBuffer buffer = new StringBuffer();
- int length;
+ int upto = 0;
+ char[] buffer = result.termBuffer();
while (true) {
- length = input.read(this.buffer);
+ final int length = input.read(buffer, upto, buffer.length-upto);
if (length == -1) break;
-
- buffer.append(this.buffer, 0, length);
+ upto += length;
+ if (upto == buffer.length)
+ buffer = result.resizeTermBuffer(1+buffer.length);
}
- String text = buffer.toString();
- return new Token(text, 0, text.length());
+ result.termLength = upto;
+ return result;
}
return null;
}
Index: src/java/org/apache/lucene/analysis/Token.java
===================================================================
--- src/java/org/apache/lucene/analysis/Token.java (revision 561926)
+++ src/java/org/apache/lucene/analysis/Token.java (working copy)
@@ -1,8 +1,5 @@
package org.apache.lucene.analysis;
-import org.apache.lucene.index.Payload;
-import org.apache.lucene.index.TermPositions;
-
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -20,6 +17,9 @@
* limitations under the License.
*/
+import org.apache.lucene.index.Payload;
+import org.apache.lucene.index.TermPositions;
+
/** A Token is an occurence of a term from the text of a field. It consists of
a term's text, the start and end offset of the term in the text of the field,
and a type string.
@@ -44,66 +44,109 @@
The APIs introduced here might change in the future and will not be
supported anymore in such a case.
+
+
+
NOTE: As of 2.3, Token stores the term text + internally as a malleable char[] termBuffer instead of + String termText. The indexing code and core tokenizers + have been changed to re-use a single Token instance, changing + its buffer and other fields in-place as the Token is + processed. This provides substantially better indexing + performance as it saves the GC cost of new'ing a Token and + String for every term. The APIs that accept String + termText are still available but a warning about the + associated performance cost has been added (below). The + {@link #termText()} method has been deprecated.
+ +Tokenizers and filters should try to re-use a Token + instance when possible for best performance. Failing + that, to create a new Token you should first use one of + the constructors that starts with null text. Then you + should call either {@link #termBuffer()} or {@link + #resizeTermBuffer(int)} to retrieve the Token's + termBuffer. Fill in the characters of your term into this + buffer, and finally call {@link #setTermLength(int)} to + set the length of the term text. See LUCENE-969 + for details.
+ @see org.apache.lucene.index.Payload - */ - // TODO: Remove warning after API has been finalized +*/ + +// TODO: Remove warning after API has been finalized + public class Token implements Cloneable { - String termText; // the text of the term + + private static final String DEFAULT_TYPE = "word"; + private static int MIN_BUFFER_SIZE = 10; + + /** @deprecated: we will remove this when we remove the + * deprecated APIs */ + private String termText; + + char[] termBuffer; // characters for the term text + int termLength; // length of term text in buffer + int startOffset; // start in source text int endOffset; // end in source text - String type = "word"; // lexical type + String type = DEFAULT_TYPE; // lexical type Payload payload; - // For better indexing speed, use termBuffer (and - // termBufferOffset/termBufferLength) instead of termText - // to save new'ing a String per token - char[] termBuffer; - int termBufferOffset; - int termBufferLength; + int positionIncrement = 1; - private int positionIncrement = 1; + /** Constructs a Token will null text. */ + public Token() { + } - /** Constructs a Token with the given term text, and start & end offsets. - The type defaults to "word." */ - public Token(String text, int start, int end) { - termText = text; + /** Constructs a Token with null text and start & end + * offsets. + * @param start start offset + * @param end end offset */ + public Token(int start, int end) { startOffset = start; endOffset = end; } - /** Constructs a Token with the given term text buffer - * starting at offset for length lenth, and start & end offsets. - * The type defaults to "word." */ - public Token(char[] text, int offset, int length, int start, int end) { - termBuffer = text; - termBufferOffset = offset; - termBufferLength = length; + /** Constructs a Token with null text and start & end + * offsets plus the Token type. 
+ * @param start start offset + * @param end end offset */ + public Token(int start, int end, String typ) { startOffset = start; endOffset = end; + type = typ; } - /** Constructs a Token with the given text, start and end offsets, & type. */ - public Token(String text, int start, int end, String typ) { + /** Constructs a Token with the given term text, and start + * & end offsets. The type defaults to "word." + * NOTE: for better indexing speed you should + * instead use the char[] termBuffer methods to set the + * term text. + * @param text term text + * @param start start offset + * @param end end offset */ + public Token(String text, int start, int end) { termText = text; startOffset = start; endOffset = end; - type = typ; } - /** Constructs a Token with the given term text buffer - * starting at offset for length lenth, and start & end - * offsets, & type. */ - public Token(char[] text, int offset, int length, int start, int end, String typ) { - termBuffer = text; - termBufferOffset = offset; - termBufferLength = length; + /** Constructs a Token with the given text, start and end + * offsets, & type. NOTE: for better indexing + * speed you should instead use the char[] termBuffer + * methods to set the term text. + * @param text term text + * @param start start offset + * @param end end offset + * @param typ token type */ + public Token(String text, int start, int end, String typ) { + termText = text; startOffset = start; endOffset = end; type = typ; } - /** Set the position increment. This determines the position of this token * relative to the previous Token in a {@link TokenStream}, used in phrase * searching. @@ -139,71 +182,175 @@ /** Returns the position increment of this Token. * @see #setPositionIncrement */ - public int getPositionIncrement() { return positionIncrement; } + public int getPositionIncrement() { + return positionIncrement; + } - /** Sets the Token's term text. */ + /** Sets the Token's term text. 
NOTE: for better + * indexing speed you should instead use the char[] + * termBuffer methods to set the term text. */ public void setTermText(String text) { termText = text; + termBuffer = null; } - /** Returns the Token's term text. */ - public final String termText() { return termText; } - public final char[] termBuffer() { return termBuffer; } - public final int termBufferOffset() { return termBufferOffset; } - public final int termBufferLength() { return termBufferLength; } + /** Returns the Token's term text. + * + * @deprecated Use {@link #termBuffer()} and {@link + * #termLength()} instead. */ + public final String termText() { + if (termText == null && termBuffer != null) + termText = new String(termBuffer, 0, termLength); + return termText; + } - public void setStartOffset(int offset) {this.startOffset = offset;} - public void setEndOffset(int offset) {this.endOffset = offset;} - + /** Copies the contents of buffer, starting at offset for + * length characters, into the termBuffer + * array. NOTE: for better indexing speed you + * should instead retrieve the termBuffer, using {@link + * #termBuffer()} or {@link #resizeTermBuffer(int)}, and + * fill it in directly to set the term text.*/ public final void setTermBuffer(char[] buffer, int offset, int length) { - this.termBuffer = buffer; - this.termBufferOffset = offset; - this.termBufferLength = length; + resizeTermBuffer(length); + System.arraycopy(buffer, offset, termBuffer, 0, length); + termLength = length; } - + /** Returns the internal termBuffer character array which + * you can then directly alter. If the array is too + * small for your token, use {@link + * #resizeTermBuffer(int)} to increase it. After + * altering the buffer be sure to call {@link + * #setTermLength} to record the number of valid + * characters that were placed into the termBuffer. */ + public final char[] termBuffer() { + initTermBuffer(); + return termBuffer; + } + + /** Grows the termBuffer to at least size newSize. 
+ * @param newSize minimum size of the new termBuffer + * @return newly created termBuffer with length >= newSize + */ + public char[] resizeTermBuffer(int newSize) { + initTermBuffer(); + if (newSize > termBuffer.length) { + int size = termBuffer.length; + while(size < newSize) + size *= 2; + char[] newBuffer = new char[size]; + System.arraycopy(termBuffer, 0, newBuffer, 0, termBuffer.length); + termBuffer = newBuffer; + } + return termBuffer; + } + + // TODO: once we remove the deprecated termText() method + // and switch entirely to char[] termBuffer we don't need + // to use this method anymore + private void initTermBuffer() { + if (termBuffer == null) { + if (termText == null) { + termBuffer = new char[MIN_BUFFER_SIZE]; + termLength = 0; + } else { + int length = termText.length(); + if (length < MIN_BUFFER_SIZE) length = MIN_BUFFER_SIZE; + termBuffer = new char[length]; + termLength = termText.length(); + termText.getChars(0, termText.length(), termBuffer, 0); + termText = null; + } + } else if (termText != null) + termText = null; + } + + /** Return number of valid characters (length of the term) + * in the termBuffer array. */ + public final int termLength() { + initTermBuffer(); + return termLength; + } + + /** Set number of valid characters (length of the term) in + * the termBuffer array. */ + public final void setTermLength(int length) { + initTermBuffer(); + termLength = length; + } + /** Returns this Token's starting offset, the position of the first character corresponding to this token in the source text. Note that the difference between endOffset() and startOffset() may not be equal to termText.length(), as the term text may have been altered by a stemmer or some other filter. */ - public final int startOffset() { return startOffset; } + public final int startOffset() { + return startOffset; + } + /** Set the starting offset. 
+ @see #startOffset() */ + public void setStartOffset(int offset) { + this.startOffset = offset; + } + /** Returns this Token's ending offset, one greater than the position of the last character corresponding to this token in the source text. */ - public final int endOffset() { return endOffset; } + public final int endOffset() { + return endOffset; + } + /** Set the ending offset. + @see #endOffset() */ + public void setEndOffset(int offset) { + this.endOffset = offset; + } + /** Returns this Token's lexical type. Defaults to "word". */ - public final String type() { return type; } + public final String type() { + return type; + } + /** Set the lexical type. + @see #type() */ + public final void setType(String type) { + this.type = type; + } + /** - * Sets this Token's payload. + * Returns this Token's payload. ** WARNING: The status of the Payloads feature is experimental. * The APIs introduced here might change in the future and will not be * supported anymore in such a case. */ // TODO: Remove warning after API has been finalized - public void setPayload(Payload payload) { - this.payload = payload; + public Payload getPayload() { + return this.payload; } - + /** - * Returns this Token's payload. + * Sets this Token's payload. *
* WARNING: The status of the Payloads feature is experimental. * The APIs introduced here might change in the future and will not be * supported anymore in such a case. */ // TODO: Remove warning after API has been finalized - public Payload getPayload() { - return this.payload; + public void setPayload(Payload payload) { + this.payload = payload; } - + public String toString() { StringBuffer sb = new StringBuffer(); - sb.append("(" + termText + "," + startOffset + "," + endOffset); + sb.append("("); + initTermBuffer(); + if (termBuffer == null) + sb.append("null"); + else + sb.append(termBuffer, 0, termLength); + sb.append("," + startOffset + "," + endOffset); if (!type.equals("word")) sb.append(",type="+type); if (positionIncrement != 1) @@ -212,11 +359,15 @@ return sb.toString(); } - public Object clone() { - try { - return super.clone(); - } catch (CloneNotSupportedException e) { - throw new RuntimeException(e); // shouldn't happen since we implement Cloneable - } + /** Reset all state for this token back to defaults (same + * state that new Token() creates). */ + public void clear() { + payload = null; + termBuffer = null; + termLength = 0; + termText = null; + positionIncrement = 1; + startOffset = endOffset = 0; + type = DEFAULT_TYPE; } } Index: src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java =================================================================== --- src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java (revision 561926) +++ src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java (working copy) @@ -69,10 +69,34 @@ /** Constructs a {@link StandardTokenizer} filtered by a {@link StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. 
*/ public TokenStream tokenStream(String fieldName, Reader reader) { - TokenStream result = new StandardTokenizer(reader); - result = new StandardFilter(result); - result = new LowerCaseFilter(result); - result = new StopFilter(result, stopSet); - return result; + TokenStream tokenStream = new StandardTokenizer(reader); + tokenStream = new StandardFilter(tokenStream); + tokenStream = new LowerCaseFilter(tokenStream); + tokenStream = new StopFilter(tokenStream, stopSet); + return tokenStream; } + + private class SavedStreams { + StandardTokenizer tokenStream; + TokenStream filteredTokenStream; + FastCharStream charStream; + }; + public TokenStream reusableTokenStream(String fieldName, Reader reader) { + SavedStreams streams = (SavedStreams) getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + setPreviousTokenStream(streams); + streams.charStream = new FastCharStream(reader); + streams.tokenStream = new StandardTokenizer(streams.charStream); + streams.tokenStream.setInput(reader); + streams.filteredTokenStream = new StandardFilter(streams.tokenStream); + streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream); + streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet); + } else { + streams.charStream.reset(reader); + streams.tokenStream.ReInit(streams.charStream); + } + + return streams.filteredTokenStream; + } } Index: src/java/org/apache/lucene/analysis/standard/StandardFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/standard/StandardFilter.java (revision 561926) +++ src/java/org/apache/lucene/analysis/standard/StandardFilter.java (working copy) @@ -37,33 +37,32 @@ *
Removes 's from the end of words. *
Removes dots from acronyms.
*/
- public final org.apache.lucene.analysis.Token next() throws java.io.IOException {
- org.apache.lucene.analysis.Token t = input.next();
+ public final org.apache.lucene.analysis.Token next(org.apache.lucene.analysis.Token result) throws java.io.IOException {
+ org.apache.lucene.analysis.Token t = input.next(result);
if (t == null)
return null;
- String text = t.termText();
- String type = t.type();
+ char[] buffer = t.termBuffer();
+ final int bufferLength = t.termLength();
+ final String type = t.type();
if (type == APOSTROPHE_TYPE && // remove 's
- (text.endsWith("'s") || text.endsWith("'S"))) {
- return new org.apache.lucene.analysis.Token
- (text.substring(0,text.length()-2),
- t.startOffset(), t.endOffset(), type);
-
+ bufferLength >= 2 &&
+ buffer[bufferLength-2] == '\'' &&
+ (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
+ // Strip last 2 characters off
+ t.setTermLength(bufferLength - 2);
} else if (type == ACRONYM_TYPE) { // remove dots
- StringBuffer trimmed = new StringBuffer();
- for (int i = 0; i < text.length(); i++) {
- char c = text.charAt(i);
- if (c != '.')
- trimmed.append(c);
+ int upto = 0;
+ for(int i=0;i
This is an abstract class.
+ NOTE: subclasses must override at least one of {@link
+ #next()} or {@link #next(Token)}.
*/
public abstract class TokenFilter extends TokenStream {
/** The source of tokens for this filter. */
Index: src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java (revision 561926)
+++ src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java (working copy)
@@ -25,144 +25,165 @@
*
*/
public class ISOLatin1AccentFilter extends TokenFilter {
- public ISOLatin1AccentFilter(TokenStream input) {
- super(input);
- }
+ public ISOLatin1AccentFilter(TokenStream input) {
+ super(input);
+ }
- public final Token next() throws java.io.IOException {
- final Token t = input.next();
- if (t != null)
- t.setTermText(removeAccents(t.termText()));
- return t;
- }
+ private char[] output = new char[256];
+ private int outputPos;
- /**
- * To replace accented characters in a String by unaccented equivalents.
- */
- public final static String removeAccents(String input) {
- final StringBuffer output = new StringBuffer();
- for (int i = 0; i < input.length(); i++) {
- switch (input.charAt(i)) {
- case '\u00C0' : // À
- case '\u00C1' : // Á
- case '\u00C2' : // Â
- case '\u00C3' : // Ã
- case '\u00C4' : // Ä
- case '\u00C5' : // Å
- output.append("A");
- break;
- case '\u00C6' : // Æ
- output.append("AE");
- break;
- case '\u00C7' : // Ç
- output.append("C");
- break;
- case '\u00C8' : // È
- case '\u00C9' : // É
- case '\u00CA' : // Ê
- case '\u00CB' : // Ë
- output.append("E");
- break;
- case '\u00CC' : // Ì
- case '\u00CD' : // Í
- case '\u00CE' : // Î
- case '\u00CF' : // Ï
- output.append("I");
- break;
- case '\u00D0' : // Ð
- output.append("D");
- break;
- case '\u00D1' : // Ñ
- output.append("N");
- break;
- case '\u00D2' : // Ò
- case '\u00D3' : // Ó
- case '\u00D4' : // Ô
- case '\u00D5' : // Õ
- case '\u00D6' : // Ö
- case '\u00D8' : // Ø
- output.append("O");
- break;
- case '\u0152' : // Œ
- output.append("OE");
- break;
- case '\u00DE' : // Þ
- output.append("TH");
- break;
- case '\u00D9' : // Ù
- case '\u00DA' : // Ú
- case '\u00DB' : // Û
- case '\u00DC' : // Ü
- output.append("U");
- break;
- case '\u00DD' : // Ý
- case '\u0178' : // Ÿ
- output.append("Y");
- break;
- case '\u00E0' : // à
- case '\u00E1' : // á
- case '\u00E2' : // â
- case '\u00E3' : // ã
- case '\u00E4' : // ä
- case '\u00E5' : // å
- output.append("a");
- break;
- case '\u00E6' : // æ
- output.append("ae");
- break;
- case '\u00E7' : // ç
- output.append("c");
- break;
- case '\u00E8' : // è
- case '\u00E9' : // é
- case '\u00EA' : // ê
- case '\u00EB' : // ë
- output.append("e");
- break;
- case '\u00EC' : // ì
- case '\u00ED' : // í
- case '\u00EE' : // î
- case '\u00EF' : // ï
- output.append("i");
- break;
- case '\u00F0' : // ð
- output.append("d");
- break;
- case '\u00F1' : // ñ
- output.append("n");
- break;
- case '\u00F2' : // ò
- case '\u00F3' : // ó
- case '\u00F4' : // ô
- case '\u00F5' : // õ
- case '\u00F6' : // ö
- case '\u00F8' : // ø
- output.append("o");
- break;
- case '\u0153' : // œ
- output.append("oe");
- break;
- case '\u00DF' : // ß
- output.append("ss");
- break;
- case '\u00FE' : // þ
- output.append("th");
- break;
- case '\u00F9' : // ù
- case '\u00FA' : // ú
- case '\u00FB' : // û
- case '\u00FC' : // ü
- output.append("u");
- break;
- case '\u00FD' : // ý
- case '\u00FF' : // ÿ
- output.append("y");
- break;
- default :
- output.append(input.charAt(i));
- break;
- }
- }
- return output.toString();
- }
-}
\ No newline at end of file
+ public final Token next(Token result) throws java.io.IOException {
+ result = input.next(result);
+ if (result != null) {
+ outputPos = 0;
+ removeAccents(result.termBuffer(), result.termLength());
+ result.setTermBuffer(output, 0, outputPos);
+ return result;
+ } else
+ return null;
+ }
+
+ private final void addChar(char c) {
+ if (outputPos == output.length) {
+ char[] newArray = new char[2*output.length];
+ System.arraycopy(output, 0, newArray, 0, output.length);
+ }
+ output[outputPos++] = c;
+ }
+
+ /**
+ * To replace accented characters in a String by unaccented equivalents.
+ */
+ public final void removeAccents(char[] input, int length) {
+ int pos = 0;
+ for (int i=0; i