Index: src/test/org/apache/lucene/analysis/TestToken.java =================================================================== --- src/test/org/apache/lucene/analysis/TestToken.java (revision 680796) +++ src/test/org/apache/lucene/analysis/TestToken.java (working copy) @@ -26,6 +26,119 @@ super(name); } + public void testCtor() throws Exception { + Token t = new Token(); + char[] content = "hello".toCharArray(); + t.setTermBuffer(content, 0, content.length); + char[] buf = t.termBuffer(); + assertNotSame(t.termBuffer(), content); + assertEquals("hello", new String(t.termBuffer(), 0, t.termLength())); + assertEquals("word", t.type()); + assertEquals(0, t.getFlags()); + + t = new Token(6, 22); + t.setTermBuffer(content, 0, content.length); + assertEquals("hello", new String(t.termBuffer(), 0, t.termLength())); + assertEquals("(hello,6,22)", t.toString()); + assertEquals("word", t.type()); + assertEquals(0, t.getFlags()); + + t = new Token(6, 22, 7); + t.setTermBuffer(content, 0, content.length); + assertEquals("hello", new String(t.termBuffer(), 0, t.termLength())); + assertEquals("(hello,6,22)", t.toString()); + assertEquals(7, t.getFlags()); + + t = new Token(6, 22, "junk"); + t.setTermBuffer(content, 0, content.length); + assertEquals("hello", new String(t.termBuffer(), 0, t.termLength())); + assertEquals("(hello,6,22,type=junk)", t.toString()); + assertEquals(0, t.getFlags()); + } + + public void testResize() { + Token t = new Token(); + char[] content = "hello".toCharArray(); + t.setTermBuffer(content, 0, content.length); + for (int i = 0; i < 2000; i++) + { + t.resizeTermBuffer(i); + assertTrue(i <= t.termBuffer().length); + assertEquals("hello", new String(t.termBuffer(), 0, t.termLength())); + } + } + + public void testGrow() { + Token t = new Token(); + StringBuffer buf = new StringBuffer("ab"); + for (int i = 0; i < 20; i++) + { + char[] content = buf.toString().toCharArray(); + t.setTermBuffer(content, 0, content.length); + assertEquals(buf.length(), 
t.termLength()); + assertEquals(buf.toString(), t.term()); + buf.append(buf.toString()); + } + assertEquals(1048576, t.termLength()); + assertEquals(1179654, t.termBuffer().length); + + // now as a string, first variant + t = new Token(); + buf = new StringBuffer("ab"); + for (int i = 0; i < 20; i++) + { + String content = buf.toString(); + t.setTermBuffer(content, 0, content.length()); + assertEquals(content.length(), t.termLength()); + assertEquals(content, t.term()); + buf.append(content); + } + assertEquals(1048576, t.termLength()); + assertEquals(1179654, t.termBuffer().length); + + // now as a string, second variant + t = new Token(); + buf = new StringBuffer("ab"); + for (int i = 0; i < 20; i++) + { + String content = buf.toString(); + t.setTermBuffer(content); + assertEquals(content.length(), t.termLength()); + assertEquals(content, t.term()); + buf.append(content); + } + assertEquals(1048576, t.termLength()); + assertEquals(1179654, t.termBuffer().length); + + // Test for slow growth to a long term + t = new Token(); + buf = new StringBuffer("a"); + for (int i = 0; i < 20000; i++) + { + String content = buf.toString(); + t.setTermBuffer(content); + assertEquals(content.length(), t.termLength()); + assertEquals(content, t.term()); + buf.append("a"); + } + assertEquals(20000, t.termLength()); + assertEquals(20331, t.termBuffer().length); + + // Test for slow growth to a long term + t = new Token(); + buf = new StringBuffer("a"); + for (int i = 0; i < 20000; i++) + { + String content = buf.toString(); + t.setTermBuffer(content); + assertEquals(content.length(), t.termLength()); + assertEquals(content, t.term()); + buf.append("a"); + } + assertEquals(20000, t.termLength()); + assertEquals(20331, t.termBuffer().length); + } + public void testToString() throws Exception { char[] b = {'a', 'l', 'o', 'h', 'a'}; Token t = new Token("", 0, 5); @@ -53,4 +166,13 @@ buffer[1] = 'o'; assertEquals(t.termText(), "hollo3"); } + + public void testClone() throws Exception { 
+ Token t = new Token(0, 5); + char[] content = "hello".toCharArray(); + t.setTermBuffer(content, 0, 5); + char[] buf = t.termBuffer(); + Token copy = (Token) t.clone(); + assertNotSame(buf, copy.termBuffer()); + } } Index: src/java/org/apache/lucene/analysis/Tokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/Tokenizer.java (revision 680796) +++ src/java/org/apache/lucene/analysis/Tokenizer.java (working copy) @@ -24,8 +24,9 @@

This is an abstract class.

- NOTE: subclasses must override at least one of {@link - #next()} or {@link #next(Token)}. + NOTE: subclasses must override {@link #next(Token)}. It's + also OK to instead override {@link #next()} but that + method is now deprecated in favor of {@link #next(Token)}.

NOTE: subclasses overriding {@link #next(Token)} must call {@link Token#clear()}. Index: src/java/org/apache/lucene/analysis/Token.java =================================================================== --- src/java/org/apache/lucene/analysis/Token.java (revision 680796) +++ src/java/org/apache/lucene/analysis/Token.java (working copy) @@ -19,8 +19,9 @@ import org.apache.lucene.index.Payload; import org.apache.lucene.index.TermPositions; // for javadoc +import org.apache.lucene.util.ArrayUtil; -/** A Token is an occurence of a term from the text of a field. It consists of +/** A Token is an occurrence of a term from the text of a field. It consists of a term's text, the start and end offset of the term in the text of the field, and a type string.

@@ -49,7 +50,7 @@

NOTE: As of 2.3, Token stores the term text internally as a malleable char[] termBuffer instead of String termText. The indexing code and core tokenizers - have been changed re-use a single Token instance, changing + have been changed to re-use a single Token instance, changing its buffer and other fields in-place as the Token is processed. This provides substantially better indexing performance as it saves the GC cost of new'ing a Token and @@ -62,14 +63,57 @@ instance when possible for best performance, by implementing the {@link TokenStream#next(Token)} API. Failing that, to create a new Token you should first use - one of the constructors that starts with null text. Then - you should call either {@link #termBuffer()} or {@link - #resizeTermBuffer(int)} to retrieve the Token's - termBuffer. Fill in the characters of your term into this - buffer, and finally call {@link #setTermLength(int)} to + one of the constructors that starts with null text. To load + the token from a char[] use {@link #setTermBuffer(char[], int, int)}. + To load from a String use {@link #setTermBuffer(String)}. + Alternatively you can get the Token's termBuffer by calling either {@link #termBuffer()}, + if you know that your text is shorter than the capacity of the termBuffer + or {@link #resizeTermBuffer(int)}, if there is any possibility + that you may need to grow the buffer. Fill in the characters of your term into this + buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string, + or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setTermLength(int)} to set the length of the term text. See LUCENE-969 for details.

+

Typical reuse patterns: +

+

@see org.apache.lucene.index.Payload */ @@ -138,7 +182,9 @@ * term text. * @param text term text * @param start start offset - * @param end end offset */ + * @param end end offset + * @deprecated + */ public Token(String text, int start, int end) { termText = text; startOffset = start; @@ -152,7 +198,9 @@ * @param text term text * @param start start offset * @param end end offset - * @param typ token type */ + * @param typ token type + * @deprecated + */ public Token(String text, int start, int end, String typ) { termText = text; startOffset = start; @@ -169,6 +217,7 @@ * @param start * @param end * @param flags token type bits + * @deprecated */ public Token(String text, int start, int end, int flags) { termText = text; @@ -218,7 +267,11 @@ /** Sets the Token's term text. NOTE: for better * indexing speed you should instead use the char[] - * termBuffer methods to set the term text. */ + * termBuffer methods to set the term text. + * @deprecated use {@link #setTermBuffer(char[], int, int)} or + * {@link #setTermBuffer(String)} or + * {@link #setTermBuffer(String, int, int)}. + */ public void setTermText(String text) { termText = text; termBuffer = null; @@ -230,7 +283,7 @@ * because the text is stored internally in a char[]. If * possible, use {@link #termBuffer()} and {@link * #termLength()} directly instead. If you really need a - * String, use new String(token.termBuffer(), 0, token.termLength()) + * String, use {@link #term()} */ public final String termText() { if (termText == null && termBuffer != null) @@ -238,19 +291,68 @@ return termText; } + /** Returns the Token's term text. + * + * This method has a performance penalty + * because the text is stored internally in a char[]. If + * possible, use {@link #termBuffer()} and {@link + * #termLength()} directly instead. 
If you really need a + * String, use this method, which is nothing more than + * a convenience call to new String(token.termBuffer(), 0, token.termLength()) + */ + public final String term() { + if (termText != null) + return termText; + initTermBuffer(); + return new String(termBuffer, 0, termLength); + } + /** Copies the contents of buffer, starting at offset for - * length characters, into the termBuffer - * array. NOTE: for better indexing speed you - * should instead retrieve the termBuffer, using {@link - * #termBuffer()} or {@link #resizeTermBuffer(int)}, and - * fill it in directly to set the term text. This saves - * an extra copy. */ + * length characters, into the termBuffer array. + * @param buffer the buffer to copy + * @param offset the index in the buffer of the first character to copy + * @param length the number of characters to copy + */ public final void setTermBuffer(char[] buffer, int offset, int length) { - resizeTermBuffer(length); + termText = null; + char[] newCharBuffer = growTermBuffer(length); + if (newCharBuffer != null) { + termBuffer = newCharBuffer; + } System.arraycopy(buffer, offset, termBuffer, 0, length); termLength = length; } + /** Copies the contents of buffer into the termBuffer array. + * @param buffer the buffer to copy + */ + public final void setTermBuffer(String buffer) { + termText = null; + int length = buffer.length(); + char[] newCharBuffer = growTermBuffer(length); + if (newCharBuffer != null) { + termBuffer = newCharBuffer; + } + buffer.getChars(0, length, termBuffer, 0); + termLength = length; + } + + /** Copies the contents of buffer, starting at offset and continuing + * for length characters, into the termBuffer array. 
+ * @param buffer the buffer to copy + * @param offset the index in the buffer of the first character to copy + * @param length the number of characters to copy + */ + public final void setTermBuffer(String buffer, int offset, int length) { + termText = null; + char[] newCharBuffer = growTermBuffer(length); + if (newCharBuffer != null) { + termBuffer = newCharBuffer; + } + buffer.getChars(offset, offset + length, termBuffer, 0); + termLength = length; + } + /** Returns the internal termBuffer character array which * you can then directly alter. If the array is too * small for your token, use {@link @@ -263,23 +365,69 @@ return termBuffer; } - /** Grows the termBuffer to at least size newSize. + /** Grows the termBuffer to at least size newSize, preserving the + * existing content. Note: If the next operation is to change + * the contents of the term buffer use + * {@link #setTermBuffer(char[], int, int)}, + * {@link #setTermBuffer(String)}, or + * {@link #setTermBuffer(String, int, int)} + * to optimally combine the resize with the setting of the termBuffer. * @param newSize minimum size of the new termBuffer * @return newly created termBuffer with length >= newSize */ public char[] resizeTermBuffer(int newSize) { - initTermBuffer(); - if (newSize > termBuffer.length) { - int size = termBuffer.length; - while(size < newSize) - size *= 2; - char[] newBuffer = new char[size]; - System.arraycopy(termBuffer, 0, newBuffer, 0, termBuffer.length); - termBuffer = newBuffer; + char[] newCharBuffer = growTermBuffer(newSize); + if (termBuffer == null) { + // If there were termText, then preserve it. + // note that if termBuffer is null then newCharBuffer cannot be null + assert newCharBuffer != null; + if (termText != null) { + termText.getChars(0, termText.length(), newCharBuffer, 0); + } + termBuffer = newCharBuffer; + } else if (newCharBuffer != null) { + // Note: if newCharBuffer != null then termBuffer needs to grow. 
+ // If there were a termBuffer, then preserve it + System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length); + termBuffer = newCharBuffer; } + termText = null; return termBuffer; } + /** Allocates a buffer char[] of at least newSize + * @param newSize minimum size of the buffer + * @return newly created buffer with length >= newSize or null if the current termBuffer is big enough + */ + private char[] growTermBuffer(int newSize) { + if (termBuffer != null) { + if (termBuffer.length >= newSize) + // Already big enough + return null; + else + // Not big enough; create a new array with slight + // over allocation: + return new char[ArrayUtil.getNextSize(newSize)]; + } else { + + // determine the best size + // The buffer is always at least MIN_BUFFER_SIZE + if (newSize < MIN_BUFFER_SIZE) { + newSize = MIN_BUFFER_SIZE; + } + + // If there is already a termText, then the size has to be at least that big + if (termText != null) { + int ttLength = termText.length(); + if (newSize < ttLength) { + newSize = ttLength; + } + } + + return new char[newSize]; + } + } + // TODO: once we remove the deprecated termText() method // and switch entirely to char[] termBuffer we don't need // to use this method anymore @@ -308,9 +456,16 @@ } /** Set number of valid characters (length of the term) in - * the termBuffer array. */ + * the termBuffer array. Use this to truncate the termBuffer + * or to synchronize with external manipulation of the termBuffer. + * Note: to grow the size of the array, + * use {@link #resizeTermBuffer(int)} first. 
+ * @param length the truncated length + */ public final void setTermLength(int length) { initTermBuffer(); + if (length > termBuffer.length) + throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")"); termLength = length; } @@ -424,9 +579,9 @@ public Object clone() { try { Token t = (Token)super.clone(); + // Do a deep clone if (termBuffer != null) { - t.termBuffer = null; - t.setTermBuffer(termBuffer, 0, termLength); + t.termBuffer = (char[]) termBuffer.clone(); } if (payload != null) { t.setPayload((Payload) payload.clone()); Index: src/java/org/apache/lucene/analysis/TokenFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/TokenFilter.java (revision 680796) +++ src/java/org/apache/lucene/analysis/TokenFilter.java (working copy) @@ -23,7 +23,7 @@

This is an abstract class. NOTE: subclasses must override at least one of {@link - #next()} or {@link #next(Token)}. + #next()} or {@link #next(Token)}. Prefer overriding {@link #next(Token)}; {@link #next()} is deprecated. */ public abstract class TokenFilter extends TokenStream { /** The source of tokens for this filter. */ Index: src/java/org/apache/lucene/analysis/TokenStream.java =================================================================== --- src/java/org/apache/lucene/analysis/TokenStream.java (revision 680796) +++ src/java/org/apache/lucene/analysis/TokenStream.java (working copy) @@ -31,14 +31,15 @@

  • {@link TokenFilter}, a TokenStream whose input is another TokenStream. - NOTE: subclasses must override at least one of {@link - #next()} or {@link #next(Token)}. + NOTE: subclasses must override {@link #next(Token)}. It's + also OK to instead override {@link #next()} but that + method is now deprecated in favor of {@link #next(Token)}. */ public abstract class TokenStream { /** Returns the next token in the stream, or null at EOS. - * The returned Token is a "full private copy" (not + * @deprecated The returned Token is a "full private copy" (not * re-used across calls to next()) but will be slower * than calling {@link #next(Token)} instead.. */ public Token next() throws IOException {