Index: src/test/org/apache/lucene/analysis/TestToken.java =================================================================== --- src/test/org/apache/lucene/analysis/TestToken.java (revision 675655) +++ src/test/org/apache/lucene/analysis/TestToken.java (working copy) @@ -26,6 +26,119 @@ super(name); } + public void testCtor() throws Exception { + Token t = new Token(); + char[] content = "hello".toCharArray(); + t.setTermBuffer(content, 0, content.length); + char[] buf = t.termBuffer(); + assertNotSame(t.termBuffer(), content); + assertEquals("hello", new String(t.termBuffer(), 0, t.termLength())); + assertEquals("word", t.type()); + assertEquals(0, t.getFlags()); + + t = new Token(6, 22); + t.setTermBuffer(content, 0, content.length); + assertEquals("hello", new String(t.termBuffer(), 0, t.termLength())); + assertEquals("(hello,6,22)", t.toString()); + assertEquals("word", t.type()); + assertEquals(0, t.getFlags()); + + t = new Token(6, 22, 7); + t.setTermBuffer(content, 0, content.length); + assertEquals("hello", new String(t.termBuffer(), 0, t.termLength())); + assertEquals("(hello,6,22)", t.toString()); + assertEquals(7, t.getFlags()); + + t = new Token(6, 22, "junk"); + t.setTermBuffer(content, 0, content.length); + assertEquals("hello", new String(t.termBuffer(), 0, t.termLength())); + assertEquals("(hello,6,22,type=junk)", t.toString()); + assertEquals(0, t.getFlags()); + } + + public void testResize() { + Token t = new Token(); + char[] content = "hello".toCharArray(); + t.setTermBuffer(content, 0, content.length); + for (int i = 0; i < 2000; i++) + { + t.resizeTermBuffer(i); + assertTrue(i <= t.termBuffer().length); + assertEquals("hello", new String(t.termBuffer(), 0, t.termLength())); + } + } + + public void testGrow() { + Token t = new Token(); + StringBuffer buf = new StringBuffer("ab"); + for (int i = 0; i < 20; i++) + { + char[] content = buf.toString().toCharArray(); + t.setTermBuffer(content, 0, content.length); + assertEquals(buf.length(), 
t.termLength()); + assertEquals(buf.toString(), t.term()); + buf.append(buf.toString()); + } + assertEquals(1048576, t.termLength()); + assertEquals(1179654, t.termBuffer().length); + + // now as a string, first variant + t = new Token(); + buf = new StringBuffer("ab"); + for (int i = 0; i < 20; i++) + { + String content = buf.toString(); + t.setTermBuffer(content, 0, content.length()); + assertEquals(content.length(), t.termLength()); + assertEquals(content, t.term()); + buf.append(content); + } + assertEquals(1048576, t.termLength()); + assertEquals(1179654, t.termBuffer().length); + + // now as a string, second variant + t = new Token(); + buf = new StringBuffer("ab"); + for (int i = 0; i < 20; i++) + { + String content = buf.toString(); + t.setTermBuffer(content); + assertEquals(content.length(), t.termLength()); + assertEquals(content, t.term()); + buf.append(content); + } + assertEquals(1048576, t.termLength()); + assertEquals(1179654, t.termBuffer().length); + + // Test for slow growth to a long term + t = new Token(); + buf = new StringBuffer("a"); + for (int i = 0; i < 20000; i++) + { + String content = buf.toString(); + t.setTermBuffer(content); + assertEquals(content.length(), t.termLength()); + assertEquals(content, t.term()); + buf.append("a"); + } + assertEquals(20000, t.termLength()); + assertEquals(20331, t.termBuffer().length); + + // Test for slow growth to a long term + t = new Token(); + buf = new StringBuffer("a"); + for (int i = 0; i < 20000; i++) + { + String content = buf.toString(); + t.setTermBuffer(content); + assertEquals(content.length(), t.termLength()); + assertEquals(content, t.term()); + buf.append("a"); + } + assertEquals(20000, t.termLength()); + assertEquals(20331, t.termBuffer().length); + } + public void testToString() throws Exception { char[] b = {'a', 'l', 'o', 'h', 'a'}; Token t = new Token("", 0, 5); @@ -53,4 +166,13 @@ buffer[1] = 'o'; assertEquals(t.termText(), "hollo3"); } + + public void testClone() throws Exception { 
+ Token t = new Token(0, 5); + char[] content = "hello".toCharArray(); + t.setTermBuffer(content, 0, 5); + char[] buf = t.termBuffer(); + Token copy = (Token) t.clone(); + assertNotSame(buf, copy.termBuffer()); + } } Index: src/java/org/apache/lucene/analysis/Tokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/Tokenizer.java (revision 675655) +++ src/java/org/apache/lucene/analysis/Tokenizer.java (working copy) @@ -25,7 +25,7 @@ This is an abstract class.

NOTE: subclasses must override at least one of {@link - #next()} or {@link #next(Token)}. + #next()} or {@link #next(Token)}. They should override {@link #next(Token)}.

NOTE: subclasses overriding {@link #next(Token)} must call {@link Token#clear()}. Index: src/java/org/apache/lucene/analysis/Token.java =================================================================== --- src/java/org/apache/lucene/analysis/Token.java (revision 675655) +++ src/java/org/apache/lucene/analysis/Token.java (working copy) @@ -20,7 +20,7 @@ import org.apache.lucene.index.Payload; import org.apache.lucene.index.TermPositions; // for javadoc -/** A Token is an occurence of a term from the text of a field. It consists of +/** A Token is an occurrence of a term from the text of a field. It consists of a term's text, the start and end offset of the term in the text of the field, and a type string.

@@ -49,7 +49,7 @@

NOTE: As of 2.3, Token stores the term text internally as a malleable char[] termBuffer instead of String termText. The indexing code and core tokenizers - have been changed re-use a single Token instance, changing + have been changed to re-use a single Token instance, changing its buffer and other fields in-place as the Token is processed. This provides substantially better indexing performance as it saves the GC cost of new'ing a Token and @@ -62,14 +62,57 @@ instance when possible for best performance, by implementing the {@link TokenStream#next(Token)} API. Failing that, to create a new Token you should first use - one of the constructors that starts with null text. Then - you should call either {@link #termBuffer()} or {@link - #resizeTermBuffer(int)} to retrieve the Token's - termBuffer. Fill in the characters of your term into this - buffer, and finally call {@link #setTermLength(int)} to + one of the constructors that starts with null text. To load + the token from a char[] use {@link #setTermBuffer(char[], int, int)}. + To load from a String use {@link #setTermBuffer(String)}. + Alternatively you can get the Token's termBuffer by calling either {@link #termBuffer()}, + if you know that your text is shorter than the capacity of the termBuffer + or {@link #resizeTermBuffer(int)}, if there is any possibility + that you may need to grow the buffer. Fill in the characters of your term into this + buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string, + or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setTermLength(int)} to set the length of the term text. See LUCENE-969 for details.

+

Typical reuse patterns: +

+

@see org.apache.lucene.index.Payload */ @@ -138,7 +181,9 @@ * term text. * @param text term text * @param start start offset - * @param end end offset */ + * @param end end offset + * @deprecated + */ public Token(String text, int start, int end) { termText = text; startOffset = start; @@ -152,7 +197,9 @@ * @param text term text * @param start start offset * @param end end offset - * @param typ token type */ + * @param typ token type + * @deprecated + */ public Token(String text, int start, int end, String typ) { termText = text; startOffset = start; @@ -169,6 +216,7 @@ * @param start * @param end * @param flags token type bits + * @deprecated */ public Token(String text, int start, int end, int flags) { termText = text; @@ -218,7 +266,11 @@ /** Sets the Token's term text. NOTE: for better * indexing speed you should instead use the char[] - * termBuffer methods to set the term text. */ + * termBuffer methods to set the term text. + * @deprecated use {@link #setTermBuffer(char[], int, int)} or + * {@link #setTermBuffer(String)} or + * {@link #setTermBuffer(String, int, int)}. + */ public void setTermText(String text) { termText = text; termBuffer = null; @@ -230,7 +282,7 @@ * because the text is stored internally in a char[]. If * possible, use {@link #termBuffer()} and {@link * #termLength()} directly instead. If you really need a - * String, use new String(token.termBuffer(), 0, token.termLength()) + * String, use {@link #term()} */ public final String termText() { if (termText == null && termBuffer != null) @@ -238,19 +290,68 @@ return termText; } + /** Returns the Token's term text. + * + * This method has a performance penalty + * because the text is stored internally in a char[]. If + * possible, use {@link #termBuffer()} and {@link + * #termLength()} directly instead. 
If you really need a + * String, use this method, which is nothing more than + * a convenience call to new String(token.termBuffer(), 0, token.termLength()) + */ + public final String term() { + if (termText != null) + return termText; + initTermBuffer(); + return new String(termBuffer, 0, termLength); + } + /** Copies the contents of buffer, starting at offset for - * length characters, into the termBuffer - * array. NOTE: for better indexing speed you - * should instead retrieve the termBuffer, using {@link - * #termBuffer()} or {@link #resizeTermBuffer(int)}, and - * fill it in directly to set the term text. This saves - * an extra copy. */ + * length characters, into the termBuffer array. + * @param buffer the buffer to copy + * @param offset the index in the buffer of the first character to copy + * @param length the number of characters to copy + */ public final void setTermBuffer(char[] buffer, int offset, int length) { - resizeTermBuffer(length); + termText = null; + char[] newCharBuffer = growTermBuffer(length); + if (newCharBuffer != null) { + termBuffer = newCharBuffer; + } System.arraycopy(buffer, offset, termBuffer, 0, length); termLength = length; } + /** Copies the contents of buffer into the termBuffer array. + * @param buffer the buffer to copy + */ + public final void setTermBuffer(String buffer) { + termText = null; + int length = buffer.length(); + char[] newCharBuffer = growTermBuffer(length); + if (newCharBuffer != null) { + termBuffer = newCharBuffer; + } + buffer.getChars(0, length, termBuffer, 0); + termLength = length; + } + + /** Copies the contents of buffer, starting at offset and continuing + * for length characters, into the termBuffer array. 
+ * @param buffer the buffer to copy + * @param offset the index in the buffer of the first character to copy + * @param length the number of characters to copy + */ + public final void setTermBuffer(String buffer, int offset, int length) { + termText = null; + char[] newCharBuffer = growTermBuffer(length); + if (newCharBuffer != null) { + termBuffer = newCharBuffer; + } + buffer.getChars(offset, offset + length, termBuffer, 0); + termLength = length; + } + /** Returns the internal termBuffer character array which * you can then directly alter. If the array is too * small for your token, use {@link @@ -263,23 +364,81 @@ return termBuffer; } - /** Grows the termBuffer to at least size newSize. + /** Grows the termBuffer to at least size newSize, preserving the + * existing content. Note: If the next operation is to change + * the contents of the term buffer use + * {@link #setTermBuffer(char[], int, int)}, + * {@link #setTermBuffer(String)}, or + * {@link #setTermBuffer(String, int, int)} + * to optimally combine the resize with the setting of the termBuffer. * @param newSize minimum size of the new termBuffer * @return newly created termBuffer with length >= newSize */ public char[] resizeTermBuffer(int newSize) { - initTermBuffer(); - if (newSize > termBuffer.length) { - int size = termBuffer.length; - while(size < newSize) - size *= 2; - char[] newBuffer = new char[size]; - System.arraycopy(termBuffer, 0, newBuffer, 0, termBuffer.length); - termBuffer = newBuffer; + char[] newCharBuffer = growTermBuffer(newSize); + if (termBuffer == null) { + // If there were termText, then preserve it. + // note that if termBuffer is null then newCharBuffer cannot be null + if (termText != null) { + termText.getChars(0, termText.length(), newCharBuffer, 0); + } + termBuffer = newCharBuffer; } + else if (newCharBuffer != null) { + // Note: if newCharBuffer != null then termBuffer needs to grow. 
+ // If there were a termBuffer, then preserve it + System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length); + termBuffer = newCharBuffer; + } + termText = null; return termBuffer; } + /** Allocates a buffer char[] of at least newSize + * @param newSize minimum size of the buffer + * @return newly created buffer with length >= newSize or null if the current termBuffer is big enough + */ + private char[] growTermBuffer(int newSize) + { + // determine the best size + // The buffer is always at least MIN_BUFFER_SIZE + if (newSize < MIN_BUFFER_SIZE) { + newSize = MIN_BUFFER_SIZE; + } + + // If there is already a termText, then the size has to be at least that big + if (termText != null) { + int ttLength = termText.length(); + if (newSize < ttLength) { + newSize = ttLength; + } + } + + // if the buffer exists and is too small, then determine a better size. + // this over-allocates modestly instead of doubling; see the note below. + int tbLength = termBuffer == null ? 0 : termBuffer.length; + + if (tbLength > 0 && newSize > tbLength) { + /* A simple allocation based on the size of the request + * is O(n**2). Using over-allocation will typically be O(n). + * Previously, this used a doubling algorithm, which + * was too aggressive in growth. This O(n) algorithm makes + * modest room for additional growth. + * The growth pattern is: + * MIN_BUFFER_SIZE, 18, 27, 37, 48, 61, 75, 91, 109, 129, 152, 178, 207, ... + */ + newSize = (newSize >> 3) + 6 + newSize; + } + + // Check to see if the buffer needs to be resized + if (newSize > tbLength) + { + return new char[newSize]; + } + + return null; + } + // TODO: once we remove the deprecated termText() method // and switch entirely to char[] termBuffer we don't need // to use this method anymore @@ -308,10 +467,16 @@ } /** Set number of valid characters (length of the term) in - * the termBuffer array. */ + * the termBuffer array. 
Use this to truncate the termBuffer + * or to synchronize with external manipulation of the termBuffer. + * Note: to grow the size of the array, + * use {@link #resizeTermBuffer(int)} first. + * @param length the truncated length + */ public final void setTermLength(int length) { initTermBuffer(); - termLength = length; + if (length <= termBuffer.length) + termLength = length; } /** Returns this Token's starting offset, the position of the first character @@ -424,9 +589,9 @@ public Object clone() { try { Token t = (Token)super.clone(); + // Do a deep clone if (termBuffer != null) { - t.termBuffer = null; - t.setTermBuffer(termBuffer, 0, termLength); + t.termBuffer = (char[]) termBuffer.clone(); } if (payload != null) { t.setPayload((Payload) payload.clone()); Index: src/java/org/apache/lucene/analysis/TokenFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/TokenFilter.java (revision 675655) +++ src/java/org/apache/lucene/analysis/TokenFilter.java (working copy) @@ -23,7 +23,7 @@

This is an abstract class. NOTE: subclasses must override at least one of {@link - #next()} or {@link #next(Token)}. + #next()} or {@link #next(Token)}. They should override {@link #next(Token)}. */ public abstract class TokenFilter extends TokenStream { /** The source of tokens for this filter. */ Index: src/java/org/apache/lucene/analysis/TokenStream.java =================================================================== --- src/java/org/apache/lucene/analysis/TokenStream.java (revision 675655) +++ src/java/org/apache/lucene/analysis/TokenStream.java (working copy) @@ -32,13 +32,13 @@ whose input is another TokenStream. NOTE: subclasses must override at least one of {@link - #next()} or {@link #next(Token)}. + #next(Token)} or {@link #next()}. They should override {@link #next(Token)}. */ public abstract class TokenStream { /** Returns the next token in the stream, or null at EOS. - * The returned Token is a "full private copy" (not + * @deprecated The returned Token is a "full private copy" (not * re-used across calls to next()) but will be slower * than calling {@link #next(Token)} instead.. */ public Token next() throws IOException {