Index: lucene/src/test/org/apache/lucene/analysis/TestToken.java
===================================================================
--- lucene/src/test/org/apache/lucene/analysis/TestToken.java (revision 682416)
+++ lucene/src/test/org/apache/lucene/analysis/TestToken.java (working copy)
@@ -17,7 +17,6 @@
* limitations under the License.
*/
-import java.io.*;
import org.apache.lucene.util.LuceneTestCase;
public class TestToken extends LuceneTestCase {
@@ -26,6 +25,119 @@
super(name);
}
+ public void testCtor() throws Exception {
+ Token t = new Token();
+ char[] content = "hello".toCharArray();
+ t.setTermBuffer(content, 0, content.length);
+ char[] buf = t.termBuffer();
+ assertNotSame(t.termBuffer(), content);
+ assertEquals("hello", t.term());
+ assertEquals("word", t.type());
+ assertEquals(0, t.getFlags());
+
+ t = new Token(6, 22);
+ t.setTermBuffer(content, 0, content.length);
+ assertEquals("hello", t.term());
+ assertEquals("(hello,6,22)", t.toString());
+ assertEquals("word", t.type());
+ assertEquals(0, t.getFlags());
+
+ t = new Token(6, 22, 7);
+ t.setTermBuffer(content, 0, content.length);
+ assertEquals("hello", t.term());
+ assertEquals("(hello,6,22)", t.toString());
+ assertEquals(7, t.getFlags());
+
+ t = new Token(6, 22, "junk");
+ t.setTermBuffer(content, 0, content.length);
+ assertEquals("hello", t.term());
+ assertEquals("(hello,6,22,type=junk)", t.toString());
+ assertEquals(0, t.getFlags());
+ }
+
+ public void testResize() {
+ Token t = new Token();
+ char[] content = "hello".toCharArray();
+ t.setTermBuffer(content, 0, content.length);
+ for (int i = 0; i < 2000; i++)
+ {
+ t.resizeTermBuffer(i);
+ assertTrue(i <= t.termBuffer().length);
+ assertEquals("hello", t.term());
+ }
+ }
+
+ public void testGrow() {
+ Token t = new Token();
+ StringBuffer buf = new StringBuffer("ab");
+ for (int i = 0; i < 20; i++)
+ {
+ char[] content = buf.toString().toCharArray();
+ t.setTermBuffer(content, 0, content.length);
+ assertEquals(buf.length(), t.termLength());
+ assertEquals(buf.toString(), t.term());
+ buf.append(buf.toString());
+ }
+ assertEquals(1048576, t.termLength());
+ assertEquals(1179654, t.termBuffer().length);
+
+ // now as a string, first variant
+ t = new Token();
+ buf = new StringBuffer("ab");
+ for (int i = 0; i < 20; i++)
+ {
+ String content = buf.toString();
+ t.setTermBuffer(content, 0, content.length());
+ assertEquals(content.length(), t.termLength());
+ assertEquals(content, t.term());
+ buf.append(content);
+ }
+ assertEquals(1048576, t.termLength());
+ assertEquals(1179654, t.termBuffer().length);
+
+ // now as a string, second variant
+ t = new Token();
+ buf = new StringBuffer("ab");
+ for (int i = 0; i < 20; i++)
+ {
+ String content = buf.toString();
+ t.setTermBuffer(content);
+ assertEquals(content.length(), t.termLength());
+ assertEquals(content, t.term());
+ buf.append(content);
+ }
+ assertEquals(1048576, t.termLength());
+ assertEquals(1179654, t.termBuffer().length);
+
+ // Test for slow growth to a long term
+ t = new Token();
+ buf = new StringBuffer("a");
+ for (int i = 0; i < 20000; i++)
+ {
+ String content = buf.toString();
+ t.setTermBuffer(content);
+ assertEquals(content.length(), t.termLength());
+ assertEquals(content, t.term());
+ buf.append("a");
+ }
+ assertEquals(20000, t.termLength());
+ assertEquals(20331, t.termBuffer().length);
+
+ // Test for slow growth to a long term
+ t = new Token();
+ buf = new StringBuffer("a");
+ for (int i = 0; i < 20000; i++)
+ {
+ String content = buf.toString();
+ t.setTermBuffer(content);
+ assertEquals(content.length(), t.termLength());
+ assertEquals(content, t.term());
+ buf.append("a");
+ }
+ assertEquals(20000, t.termLength());
+ assertEquals(20331, t.termBuffer().length);
+ }
+
public void testToString() throws Exception {
char[] b = {'a', 'l', 'o', 'h', 'a'};
Token t = new Token("", 0, 5);
@@ -40,10 +152,10 @@
Token t = new Token("hello", 0, 5);
assertEquals(t.termText(), "hello");
assertEquals(t.termLength(), 5);
- assertEquals(new String(t.termBuffer(), 0, 5), "hello");
+ assertEquals(t.term(), "hello");
t.setTermText("hello2");
assertEquals(t.termLength(), 6);
- assertEquals(new String(t.termBuffer(), 0, 6), "hello2");
+ assertEquals(t.term(), "hello2");
t.setTermBuffer("hello3".toCharArray(), 0, 6);
assertEquals(t.termText(), "hello3");
@@ -53,4 +165,13 @@
buffer[1] = 'o';
assertEquals(t.termText(), "hollo3");
}
+
+ public void testClone() throws Exception {
+ Token t = new Token(0, 5);
+ char[] content = "hello".toCharArray();
+ t.setTermBuffer(content, 0, 5);
+ char[] buf = t.termBuffer();
+ Token copy = (Token) t.clone();
+ assertNotSame(buf, copy.termBuffer());
+ }
}
Index: lucene/src/java/org/apache/lucene/analysis/Tokenizer.java
===================================================================
--- lucene/src/java/org/apache/lucene/analysis/Tokenizer.java (revision 682416)
+++ lucene/src/java/org/apache/lucene/analysis/Tokenizer.java (working copy)
@@ -24,8 +24,9 @@
This is an abstract class.
- NOTE: subclasses must override at least one of {@link
- #next()} or {@link #next(Token)}.
+ NOTE: subclasses must override {@link #next(Token)}. It's
+ also OK to instead override {@link #next()} but that
+ method is now deprecated in favor of {@link #next(Token)}.
NOTE: subclasses overriding {@link #next(Token)} must
call {@link Token#clear()}.
Index: lucene/src/java/org/apache/lucene/analysis/Token.java
===================================================================
--- lucene/src/java/org/apache/lucene/analysis/Token.java (revision 682416)
+++ lucene/src/java/org/apache/lucene/analysis/Token.java (working copy)
@@ -19,8 +19,9 @@
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.TermPositions; // for javadoc
+import org.apache.lucene.util.ArrayUtil;
-/** A Token is an occurence of a term from the text of a field. It consists of
+/** A Token is an occurrence of a term from the text of a field. It consists of
a term's text, the start and end offset of the term in the text of the field,
and a type string.
@@ -29,7 +30,7 @@
browser, or to show matching text fragments in a KWIC (KeyWord In Context)
display, etc.
- The type is an interned string, assigned by a lexical analyzer
+ The type is a string, assigned by a lexical analyzer
(a.k.a. tokenizer), naming the lexical or syntactic class that the token
belongs to. For example an end of sentence marker token might be implemented
with type "eos". The default token type is "word".
@@ -49,7 +50,7 @@
NOTE: As of 2.3, Token stores the term text
internally as a malleable char[] termBuffer instead of
String termText. The indexing code and core tokenizers
- have been changed re-use a single Token instance, changing
+ have been changed to re-use a single Token instance, changing
its buffer and other fields in-place as the Token is
processed. This provides substantially better indexing
performance as it saves the GC cost of new'ing a Token and
@@ -62,14 +63,79 @@
instance when possible for best performance, by
implementing the {@link TokenStream#next(Token)} API.
Failing that, to create a new Token you should first use
- one of the constructors that starts with null text. Then
- you should call either {@link #termBuffer()} or {@link
- #resizeTermBuffer(int)} to retrieve the Token's
- termBuffer. Fill in the characters of your term into this
- buffer, and finally call {@link #setTermLength(int)} to
+ one of the constructors that starts with null text. To load
+ the token from a char[] use {@link #setTermBuffer(char[], int, int)}.
+ To load from a String use {@link #setTermBuffer(String)} or {@link #setTermBuffer(String, int, int)}.
+ Alternatively you can get the Token's termBuffer by calling either {@link #termBuffer()},
+ if you know that your text is shorter than the capacity of the termBuffer
+ or {@link #resizeTermBuffer(int)}, if there is any possibility
+ that you may need to grow the buffer. Fill in the characters of your term into this
+ buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string,
+ or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setTermLength(int)} to
set the length of the term text. See LUCENE-969
for details.
+ Typical reuse patterns:
+
+ - Copying text from a string:
+
+ // prepare the token for re-use
+ reusableToken.clear();
+ reusableToken.setTermBuffer(string);
+ reusableToken.setStartOffset(startOffset);
+ reusableToken.setEndOffset(endOffset);
+ reusableToken.setType(Token.DEFAULT_TYPE);
+
+
+ - Copying some text from a string:
+
+ // prepare the token for re-use
+ reusableToken.clear();
+ reusableToken.setTermBuffer(string, 0, string.length() - 1);
+ reusableToken.setStartOffset(startOffset);
+ reusableToken.setEndOffset(endOffset);
+ reusableToken.setType(Token.DEFAULT_TYPE);
+
+
+ - Copying text from char[] buffer:
+
+ // prepare the token for re-use
+ reusableToken.clear();
+ reusableToken.setTermBuffer(buffer, 0, buffer.length);
+ reusableToken.setStartOffset(startOffset);
+ reusableToken.setEndOffset(endOffset);
+ reusableToken.setType(Token.DEFAULT_TYPE);
+
+
+ - Copying some text from a char[] buffer:
+
+ // prepare the token for re-use
+ reusableToken.clear();
+ reusableToken.setTermBuffer(buffer, start, end - start);
+ reusableToken.setStartOffset(startOffset);
+ reusableToken.setEndOffset(endOffset);
+ reusableToken.setType(Token.DEFAULT_TYPE);
+
+
+ - Copying from one Token to another:
+
+ // prepare the token for re-use
+ reusableToken.clear();
+ reusableToken.setTermBuffer(source.termBuffer(), 0, source.termLength());
+ reusableToken.setStartOffset(startOffset);
+ reusableToken.setEndOffset(endOffset);
+ reusableToken.setType(Token.DEFAULT_TYPE);
+
+
+
+ A couple of things to note:
+
+ - clear() initializes most of the fields to default values, but not startOffset, endOffset and type.
+ - Because TokenStreams can be chained, one cannot assume that the Token's current type is correct.
+ - The startOffset and endOffset represent the start and end offsets in the source text, so be careful when adjusting them.
+ - When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.
+
+
@see org.apache.lucene.index.Payload
*/
@@ -83,16 +149,56 @@
* deprecated APIs */
private String termText;
- char[] termBuffer; // characters for the term text
- int termLength; // length of term text in buffer
+ /**
+ * Characters for the term text.
+ * @deprecated This will be made private. Instead, use:
+ * {@link #termBuffer()},
+ * {@link #setTermBuffer(char[], int, int)},
+ * {@link #setTermBuffer(String)}, or
+ * {@link #setTermBuffer(String, int, int)}
+ */
+ char[] termBuffer;
- int startOffset; // start in source text
- int endOffset; // end in source text
- String type = DEFAULT_TYPE; // lexical type
+ /**
+ * Length of term text in the buffer.
+ * @deprecated This will be made private. Instead, use:
+ * {@link #termLength()}, or {@link #setTermLength(int)}.
+ */
+ int termLength;
+
+ /**
+ * Start in source text.
+ * @deprecated This will be made private. Instead, use:
+ * {@link #startOffset()}, or {@link #setStartOffset(int)}.
+ */
+ int startOffset;
+
+ /**
+ * End in source text.
+ * @deprecated This will be made private. Instead, use:
+ * {@link #endOffset()}, or {@link #setEndOffset(int)}.
+ */
+ int endOffset;
+
+ /**
+ * The lexical type of the token.
+ * @deprecated This will be made private. Instead, use:
+ * {@link #type()}, or {@link #setType(String)}.
+ */
+ String type = DEFAULT_TYPE;
+
private int flags;
+ /**
+ * @deprecated This will be made private. Instead, use:
+ * {@link #getPayload()}, or {@link #setPayload(Payload)}.
+ */
Payload payload;
+ /**
+ * @deprecated This will be made private. Instead, use:
+ * {@link #getPositionIncrement()}, or {@link #setPositionIncrement(int)}.
+ */
int positionIncrement = 1;
/** Constructs a Token will null text. */
@@ -101,8 +207,8 @@
/** Constructs a Token with null text and start & end
* offsets.
- * @param start start offset
- * @param end end offset */
+ * @param start start offset in the source text
+ * @param end end offset in the source text */
public Token(int start, int end) {
startOffset = start;
endOffset = end;
@@ -110,8 +216,9 @@
/** Constructs a Token with null text and start & end
* offsets plus the Token type.
- * @param start start offset
- * @param end end offset */
+ * @param start start offset in the source text
+ * @param end end offset in the source text
+ * @param typ the lexical type of this Token */
public Token(int start, int end, String typ) {
startOffset = start;
endOffset = end;
@@ -120,10 +227,10 @@
/**
* Constructs a Token with null text and start & end
- * offsets plus the Token type.
- * @param start start offset
- * @param end end offset
- * @param flags The bits to set for this token
+ * offsets plus flags.
+ * @param start start offset in the source text
+ * @param end end offset in the source text
+ * @param flags The bits to set for this token
*/
public Token(int start, int end, int flags){
startOffset = start;
@@ -138,7 +245,9 @@
* term text.
* @param text term text
* @param start start offset
- * @param end end offset */
+ * @param end end offset
+ * @deprecated
+ */
public Token(String text, int start, int end) {
termText = text;
startOffset = start;
@@ -152,7 +261,9 @@
* @param text term text
* @param start start offset
* @param end end offset
- * @param typ token type */
+ * @param typ token type
+ * @deprecated
+ */
public Token(String text, int start, int end, String typ) {
termText = text;
startOffset = start;
@@ -169,6 +280,7 @@
* @param start
* @param end
* @param flags token type bits
+ * @deprecated
*/
public Token(String text, int start, int end, int flags) {
termText = text;
@@ -200,6 +312,7 @@
* occur with no intervening stop words.
*
*
+ * @param positionIncrement the distance from the prior term
* @see org.apache.lucene.index.TermPositions
*/
public void setPositionIncrement(int positionIncrement) {
@@ -218,7 +331,11 @@
/** Sets the Token's term text. NOTE: for better
* indexing speed you should instead use the char[]
- * termBuffer methods to set the term text. */
+ * termBuffer methods to set the term text.
+ * @deprecated use {@link #setTermBuffer(char[], int, int)} or
+ * {@link #setTermBuffer(String)} or
+ * {@link #setTermBuffer(String, int, int)}.
+ */
public void setTermText(String text) {
termText = text;
termBuffer = null;
@@ -230,7 +347,7 @@
* because the text is stored internally in a char[]. If
* possible, use {@link #termBuffer()} and {@link
* #termLength()} directly instead. If you really need a
- * String, use new String(token.termBuffer(), 0, token.termLength())
+ * String, use {@link #term()}
*/
public final String termText() {
if (termText == null && termBuffer != null)
@@ -238,19 +355,68 @@
return termText;
}
+ /** Returns the Token's term text.
+ *
+ * This method has a performance penalty
+ * because the text is stored internally in a char[]. If
+ * possible, use {@link #termBuffer()} and {@link
+ * #termLength()} directly instead. If you really need a
+ * String, use this method, which is nothing more than
+ * a convenience call to new String(token.termBuffer(), 0, token.termLength())
+ */
+ public final String term() {
+ if (termText != null)
+ return termText;
+ initTermBuffer();
+ return new String(termBuffer, 0, termLength);
+ }
+
/** Copies the contents of buffer, starting at offset for
- * length characters, into the termBuffer
- * array. NOTE: for better indexing speed you
- * should instead retrieve the termBuffer, using {@link
- * #termBuffer()} or {@link #resizeTermBuffer(int)}, and
- * fill it in directly to set the term text. This saves
- * an extra copy. */
+ * length characters, into the termBuffer array.
+ * @param buffer the buffer to copy
+ * @param offset the index in the buffer of the first character to copy
+ * @param length the number of characters to copy
+ */
public final void setTermBuffer(char[] buffer, int offset, int length) {
- resizeTermBuffer(length);
+ termText = null;
+ char[] newCharBuffer = growTermBuffer(length);
+ if (newCharBuffer != null) {
+ termBuffer = newCharBuffer;
+ }
System.arraycopy(buffer, offset, termBuffer, 0, length);
termLength = length;
}
+ /** Copies the contents of buffer into the termBuffer array.
+ * @param buffer the buffer to copy
+ */
+ public final void setTermBuffer(String buffer) {
+ termText = null;
+ int length = buffer.length();
+ char[] newCharBuffer = growTermBuffer(length);
+ if (newCharBuffer != null) {
+ termBuffer = newCharBuffer;
+ }
+ buffer.getChars(0, length, termBuffer, 0);
+ termLength = length;
+ }
+
+ /** Copies the contents of buffer, starting at offset and continuing
+ * for length characters, into the termBuffer array.
+ * @param buffer the buffer to copy
+ * @param offset the index in the buffer of the first character to copy
+ * @param length the number of characters to copy
+ */
+ public final void setTermBuffer(String buffer, int offset, int length) {
+ termText = null;
+ char[] newCharBuffer = growTermBuffer(length);
+ if (newCharBuffer != null) {
+ termBuffer = newCharBuffer;
+ }
+ buffer.getChars(offset, offset + length, termBuffer, 0);
+ termLength = length;
+ }
+
/** Returns the internal termBuffer character array which
* you can then directly alter. If the array is too
* small for your token, use {@link
@@ -263,23 +429,69 @@
return termBuffer;
}
- /** Grows the termBuffer to at least size newSize.
+ /** Grows the termBuffer to at least size newSize, preserving the
+ * existing content. Note: If the next operation is to change
+ * the contents of the term buffer use
+ * {@link #setTermBuffer(char[], int, int)},
+ * {@link #setTermBuffer(String)}, or
+ * {@link #setTermBuffer(String, int, int)}
+ * to optimally combine the resize with the setting of the termBuffer.
* @param newSize minimum size of the new termBuffer
* @return newly created termBuffer with length >= newSize
*/
public char[] resizeTermBuffer(int newSize) {
- initTermBuffer();
- if (newSize > termBuffer.length) {
- int size = termBuffer.length;
- while(size < newSize)
- size *= 2;
- char[] newBuffer = new char[size];
- System.arraycopy(termBuffer, 0, newBuffer, 0, termBuffer.length);
- termBuffer = newBuffer;
+ char[] newCharBuffer = growTermBuffer(newSize);
+ if (termBuffer == null) {
+ // If there were termText, then preserve it.
+ // note that if termBuffer is null then newCharBuffer cannot be null
+ assert newCharBuffer != null;
+ if (termText != null) {
+ termText.getChars(0, termText.length(), newCharBuffer, 0);
+ }
+ termBuffer = newCharBuffer;
+ } else if (newCharBuffer != null) {
+ // Note: if newCharBuffer != null then termBuffer needs to grow.
+ // If there were a termBuffer, then preserve it
+ System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
+ termBuffer = newCharBuffer;
}
+ termText = null;
return termBuffer;
}
+ /** Allocates a buffer char[] of at least newSize
+ * @param newSize minimum size of the buffer
+ * @return newly created buffer with length >= newSize or null if the current termBuffer is big enough
+ */
+ private char[] growTermBuffer(int newSize) {
+ if (termBuffer != null) {
+ if (termBuffer.length >= newSize)
+ // Already big enough
+ return null;
+ else
+ // Not big enough; create a new array with slight
+ // over allocation:
+ return new char[ArrayUtil.getNextSize(newSize)];
+ } else {
+
+ // determine the best size
+ // The buffer is always at least MIN_BUFFER_SIZE
+ if (newSize < MIN_BUFFER_SIZE) {
+ newSize = MIN_BUFFER_SIZE;
+ }
+
+ // If there is already a termText, then the size has to be at least that big
+ if (termText != null) {
+ int ttLength = termText.length();
+ if (newSize < ttLength) {
+ newSize = ttLength;
+ }
+ }
+
+ return new char[newSize];
+ }
+ }
+
// TODO: once we remove the deprecated termText() method
// and switch entirely to char[] termBuffer we don't need
// to use this method anymore
@@ -308,9 +520,16 @@
}
/** Set number of valid characters (length of the term) in
- * the termBuffer array. */
+ * the termBuffer array. Use this to truncate the termBuffer
+ * or to synchronize with external manipulation of the termBuffer.
+ * Note: to grow the size of the array,
+ * use {@link #resizeTermBuffer(int)} first.
+ * @param length the truncated length
+ */
public final void setTermLength(int length) {
initTermBuffer();
+ if (length > termBuffer.length)
+ throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")");
termLength = length;
}
@@ -331,7 +550,8 @@
}
/** Returns this Token's ending offset, one greater than the position of the
- last character corresponding to this token in the source text. */
+ last character corresponding to this token in the source text. The length
+ of the token in the source text is (endOffset - startOffset). */
public final int endOffset() {
return endOffset;
}
@@ -374,8 +594,6 @@
this.flags = flags;
}
-
-
/**
* Returns this Token's payload.
*/
@@ -424,9 +642,9 @@
public Object clone() {
try {
Token t = (Token)super.clone();
+ // Do a deep clone
if (termBuffer != null) {
- t.termBuffer = null;
- t.setTermBuffer(termBuffer, 0, termLength);
+ t.termBuffer = (char[]) termBuffer.clone();
}
if (payload != null) {
t.setPayload((Payload) payload.clone());
Index: lucene/src/java/org/apache/lucene/analysis/TokenFilter.java
===================================================================
--- lucene/src/java/org/apache/lucene/analysis/TokenFilter.java (revision 682416)
+++ lucene/src/java/org/apache/lucene/analysis/TokenFilter.java (working copy)
@@ -23,7 +23,7 @@
This is an abstract class.
NOTE: subclasses must override at least one of {@link
- #next()} or {@link #next(Token)}.
+ #next()} or {@link #next(Token)}. They should override {@link #next(Token)}.
*/
public abstract class TokenFilter extends TokenStream {
/** The source of tokens for this filter. */
Index: lucene/src/java/org/apache/lucene/analysis/TokenStream.java
===================================================================
--- lucene/src/java/org/apache/lucene/analysis/TokenStream.java (revision 682416)
+++ lucene/src/java/org/apache/lucene/analysis/TokenStream.java (working copy)
@@ -31,14 +31,15 @@
{@link TokenFilter}, a TokenStream
whose input is another TokenStream.
- NOTE: subclasses must override at least one of {@link
- #next()} or {@link #next(Token)}.
+ NOTE: subclasses must override {@link #next(Token)}. It's
+ also OK to instead override {@link #next()} but that
+ method is now deprecated in favor of {@link #next(Token)}.
*/
public abstract class TokenStream {
/** Returns the next token in the stream, or null at EOS.
- * The returned Token is a "full private copy" (not
+ * @deprecated The returned Token is a "full private copy" (not
* re-used across calls to next()) but will be slower
* than calling {@link #next(Token)} instead.. */
public Token next() throws IOException {