Index: src/test/org/apache/lucene/analysis/TestToken.java =================================================================== --- src/test/org/apache/lucene/analysis/TestToken.java (revision 675655) +++ src/test/org/apache/lucene/analysis/TestToken.java (working copy) @@ -26,6 +26,119 @@ super(name); } + public void testCtor() throws Exception { + Token t = new Token(); + char[] content = "hello".toCharArray(); + t.setTermBuffer(content, 0, content.length); + char[] buf = t.termBuffer(); + assertNotSame(t.termBuffer(), content); + assertEquals("hello", new String(t.termBuffer(), 0, t.termLength())); + assertEquals("word", t.type()); + assertEquals(0, t.getFlags()); + + t = new Token(6, 22); + t.setTermBuffer(content, 0, content.length); + assertEquals("hello", new String(t.termBuffer(), 0, t.termLength())); + assertEquals("(hello,6,22)", t.toString()); + assertEquals("word", t.type()); + assertEquals(0, t.getFlags()); + + t = new Token(6, 22, 7); + t.setTermBuffer(content, 0, content.length); + assertEquals("hello", new String(t.termBuffer(), 0, t.termLength())); + assertEquals("(hello,6,22)", t.toString()); + assertEquals(7, t.getFlags()); + + t = new Token(6, 22, "junk"); + t.setTermBuffer(content, 0, content.length); + assertEquals("hello", new String(t.termBuffer(), 0, t.termLength())); + assertEquals("(hello,6,22,type=junk)", t.toString()); + assertEquals(0, t.getFlags()); + } + + public void testResize() { + Token t = new Token(); + char[] content = "hello".toCharArray(); + t.setTermBuffer(content, 0, content.length); + for (int i = 0; i < 2000; i++) + { + t.resizeTermBuffer(i); + assertTrue(i <= t.termBuffer().length); + assertEquals("hello", new String(t.termBuffer(), 0, t.termLength())); + } + } + + public void testGrow() { + Token t = new Token(); + StringBuffer buf = new StringBuffer("ab"); + for (int i = 0; i < 20; i++) + { + char[] content = buf.toString().toCharArray(); + t.setTermBuffer(content, 0, content.length); + assertEquals(buf.length(), 
t.termLength()); + assertEquals(buf.toString(), t.term()); + buf.append(buf.toString()); + } + assertEquals(1048576, t.termLength()); + assertEquals(1179654, t.termBuffer().length); + + // now as a string, first variant + t = new Token(); + buf = new StringBuffer("ab"); + for (int i = 0; i < 20; i++) + { + String content = buf.toString(); + t.setTermBuffer(content, 0, content.length()); + assertEquals(content.length(), t.termLength()); + assertEquals(content, t.term()); + buf.append(content); + } + assertEquals(1048576, t.termLength()); + assertEquals(1179654, t.termBuffer().length); + + // now as a string, second variant + t = new Token(); + buf = new StringBuffer("ab"); + for (int i = 0; i < 20; i++) + { + String content = buf.toString(); + t.setTermBuffer(content); + assertEquals(content.length(), t.termLength()); + assertEquals(content, t.term()); + buf.append(content); + } + assertEquals(1048576, t.termLength()); + assertEquals(1179654, t.termBuffer().length); + + // Test for slow growth to a long term + t = new Token(); + buf = new StringBuffer("a"); + for (int i = 0; i < 20000; i++) + { + String content = buf.toString(); + t.setTermBuffer(content); + assertEquals(content.length(), t.termLength()); + assertEquals(content, t.term()); + buf.append("a"); + } + assertEquals(20000, t.termLength()); + assertEquals(20331, t.termBuffer().length); + + // Test for slow growth to a long term + t = new Token(); + buf = new StringBuffer("a"); + for (int i = 0; i < 20000; i++) + { + String content = buf.toString(); + t.setTermBuffer(content); + assertEquals(content.length(), t.termLength()); + assertEquals(content, t.term()); + buf.append("a"); + } + assertEquals(20000, t.termLength()); + assertEquals(20331, t.termBuffer().length); + } + public void testToString() throws Exception { char[] b = {'a', 'l', 'o', 'h', 'a'}; Token t = new Token("", 0, 5); @@ -53,4 +166,13 @@ buffer[1] = 'o'; assertEquals(t.termText(), "hollo3"); } + + public void testClone() throws Exception { 
+ Token t = new Token(0, 5); + char[] content = "hello".toCharArray(); + t.setTermBuffer(content, 0, 5); + char[] buf = t.termBuffer(); + Token copy = (Token) t.clone(); + assertNotSame(buf, copy.termBuffer()); + } } Index: src/java/org/apache/lucene/analysis/Tokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/Tokenizer.java (revision 675655) +++ src/java/org/apache/lucene/analysis/Tokenizer.java (working copy) @@ -25,7 +25,7 @@ This is an abstract class.
NOTE: subclasses must override at least one of {@link - #next()} or {@link #next(Token)}. + #next()} or {@link #next(Token)}. They should override {@link #next(Token)}.
NOTE: subclasses overriding {@link #next(Token)} must call {@link Token#clear()}. Index: src/java/org/apache/lucene/analysis/Token.java =================================================================== --- src/java/org/apache/lucene/analysis/Token.java (revision 675655) +++ src/java/org/apache/lucene/analysis/Token.java (working copy) @@ -20,7 +20,7 @@ import org.apache.lucene.index.Payload; import org.apache.lucene.index.TermPositions; // for javadoc -/** A Token is an occurence of a term from the text of a field. It consists of +/** A Token is an occurrence of a term from the text of a field. It consists of a term's text, the start and end offset of the term in the text of the field, and a type string.
@@ -49,7 +49,7 @@
NOTE: As of 2.3, Token stores the term text internally as a malleable char[] termBuffer instead of String termText. The indexing code and core tokenizers - have been changed re-use a single Token instance, changing + have been changed to re-use a single Token instance, changing its buffer and other fields in-place as the Token is processed. This provides substantially better indexing performance as it saves the GC cost of new'ing a Token and @@ -62,14 +62,57 @@ instance when possible for best performance, by implementing the {@link TokenStream#next(Token)} API. Failing that, to create a new Token you should first use - one of the constructors that starts with null text. Then - you should call either {@link #termBuffer()} or {@link - #resizeTermBuffer(int)} to retrieve the Token's - termBuffer. Fill in the characters of your term into this - buffer, and finally call {@link #setTermLength(int)} to + one of the constructors that starts with null text. To load + the token from a char[] use {@link #setTermBuffer(char[], int, int)}. + To load from a String use {@link #setTermBuffer(String)} or {@link #setTermBuffer(String, int, int)}. + Alternatively you can get the Token's termBuffer by calling either {@link #termBuffer()}, + if you know that your text is shorter than the capacity of the termBuffer, + or {@link #resizeTermBuffer(int)}, if there is any possibility + that you may need to grow the buffer. Fill in the characters of your term into this + buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string, + or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setTermLength(int)} to set the length of the term text. See LUCENE-969 for details.
+Typical reuse patterns: +
+ // prepare the token for re-use + reusableToken.clear(); + reusableToken.setTermBuffer(string); ++
+ // prepare the token for re-use + reusableToken.clear(); + reusableToken.setTermBuffer(string, 0, string.length()); ++
+ // prepare the token for re-use + reusableToken.clear(); + reusableToken.setTermBuffer(buffer, 0, buffer.length); ++
+ // prepare the token for re-use + reusableToken.clear(); + reusableToken.setTermBuffer(buffer, start, end - start); ++
+ // prepare the token for re-use + reusableToken.clear(); + reusableToken.setTermBuffer(source.termBuffer(), 0, source.termLength()); ++
This is an abstract class. NOTE: subclasses must override at least one of {@link - #next()} or {@link #next(Token)}. + #next()} or {@link #next(Token)}. They should override {@link #next(Token)}. */ public abstract class TokenFilter extends TokenStream { /** The source of tokens for this filter. */ Index: src/java/org/apache/lucene/analysis/TokenStream.java =================================================================== --- src/java/org/apache/lucene/analysis/TokenStream.java (revision 675655) +++ src/java/org/apache/lucene/analysis/TokenStream.java (working copy) @@ -32,13 +32,13 @@ whose input is another TokenStream. NOTE: subclasses must override at least one of {@link - #next()} or {@link #next(Token)}. + #next(Token)} or {@link #next()}. They should override {@link #next(Token)}. */ public abstract class TokenStream { /** Returns the next token in the stream, or null at EOS. - * The returned Token is a "full private copy" (not + * @deprecated The returned Token is a "full private copy" (not * re-used across calls to next()) but will be slower * than calling {@link #next(Token)} instead.. */ public Token next() throws IOException {