Index: src/java/org/apache/lucene/analysis/Token.java =================================================================== --- src/java/org/apache/lucene/analysis/Token.java (revision 924661) +++ src/java/org/apache/lucene/analysis/Token.java (working copy) @@ -64,14 +64,14 @@ implementing the {@link TokenStream#incrementToken()} API. Failing that, to create a new Token you should first use one of the constructors that starts with null text. To load - the token from a char[] use {@link #setTermBuffer(char[], int, int)}. - To load from a String use {@link #setTermBuffer(String)} or {@link #setTermBuffer(String, int, int)}. - Alternatively you can get the Token's termBuffer by calling either {@link #termBuffer()}, + the token from a char[] use {@link #copyBuffer(char[], int, int)}. + To load from a String use {@link #setEmpty} followed by {@link #append(CharSequence)} or {@link #append(CharSequence, int, int)}. + Alternatively you can get the Token's termBuffer by calling either {@link #buffer()}, if you know that your text is shorter than the capacity of the termBuffer - or {@link #resizeTermBuffer(int)}, if there is any possibility + or {@link #resizeBuffer(int)}, if there is any possibility that you may need to grow the buffer. Fill in the characters of your term into this buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string, - or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setTermLength(int)} to + or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setLength(int)} to set the length of the term text. See LUCENE-969 for details.

@@ -100,7 +100,7 @@
  • Copying from one Token to another (type is reset to {@link #DEFAULT_TYPE} if not specified):
    -    return reusableToken.reinit(source.termBuffer(), 0, source.termLength(), source.startOffset(), source.endOffset()[, source.type()]);
    +    return reusableToken.reinit(source.buffer(), 0, source.length(), source.startOffset(), source.endOffset()[, source.type()]);
       
  • @@ -172,7 +172,7 @@ * @param end end offset */ public Token(String text, int start, int end) { - setTermBuffer(text); + append(text); startOffset = start; endOffset = end; } @@ -187,7 +187,7 @@ * @param typ token type */ public Token(String text, int start, int end, String typ) { - setTermBuffer(text); + append(text); startOffset = start; endOffset = end; type = typ; @@ -204,7 +204,7 @@ * @param flags token type bits */ public Token(String text, int start, int end, int flags) { - setTermBuffer(text); + append(text); startOffset = start; endOffset = end; this.flags = flags; @@ -221,7 +221,7 @@ * @param end */ public Token(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) { - setTermBuffer(startTermBuffer, termBufferOffset, termBufferLength); + copyBuffer(startTermBuffer, termBufferOffset, termBufferLength); startOffset = start; endOffset = end; } @@ -270,7 +270,7 @@ corresponding to this token in the source text. Note that the difference between endOffset() and startOffset() may not be - equal to {@link #termLength}, as the term text may have been altered by a + equal to {@link #length}, as the term text may have been altered by a stemmer or some other filter. */ public final int startOffset() { return startOffset; @@ -351,7 +351,7 @@ @Override public String toString() { final StringBuilder sb = new StringBuilder(); - sb.append('(').append(term()).append(',') + sb.append('(').append(super.toString()).append(',') .append(startOffset).append(',').append(endOffset); if (!"word".equals(type)) sb.append(",type=").append(type); @@ -387,7 +387,7 @@ /** Makes a clone, but replaces the term buffer & * start/end offset in the process. This is more * efficient than doing a full clone (and then calling - * setTermBuffer) because it saves a wasted copy of the old + * {@link #copyBuffer}) because it saves a wasted copy of the old * termBuffer. 
*/ public Token clone(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { final Token t = new Token(newTermBuffer, newTermOffset, newTermLength, newStartOffset, newEndOffset); @@ -442,16 +442,16 @@ } /** Shorthand for calling {@link #clear}, - * {@link #setTermBuffer(char[], int, int)}, + * {@link #copyBuffer(char[], int, int)}, * {@link #setStartOffset}, * {@link #setEndOffset}, * {@link #setType} * @return this Token instance */ public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) { clearNoTermBuffer(); + copyBuffer(newTermBuffer, newTermOffset, newTermLength); payload = null; positionIncrement = 1; - setTermBuffer(newTermBuffer, newTermOffset, newTermLength); startOffset = newStartOffset; endOffset = newEndOffset; type = newType; @@ -459,14 +459,14 @@ } /** Shorthand for calling {@link #clear}, - * {@link #setTermBuffer(char[], int, int)}, + * {@link #copyBuffer(char[], int, int)}, * {@link #setStartOffset}, * {@link #setEndOffset} * {@link #setType} on Token.DEFAULT_TYPE * @return this Token instance */ public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { clearNoTermBuffer(); - setTermBuffer(newTermBuffer, newTermOffset, newTermLength); + copyBuffer(newTermBuffer, newTermOffset, newTermLength); startOffset = newStartOffset; endOffset = newEndOffset; type = DEFAULT_TYPE; @@ -474,14 +474,14 @@ } /** Shorthand for calling {@link #clear}, - * {@link #setTermBuffer(String)}, + * {@link #append(CharSequence)}, * {@link #setStartOffset}, * {@link #setEndOffset} * {@link #setType} * @return this Token instance */ public Token reinit(String newTerm, int newStartOffset, int newEndOffset, String newType) { - clearNoTermBuffer(); - setTermBuffer(newTerm); + clear(); + append(newTerm); startOffset = newStartOffset; endOffset = newEndOffset; type = newType; @@ -489,14 +489,14 @@ } 
/** Shorthand for calling {@link #clear}, - * {@link #setTermBuffer(String, int, int)}, + * {@link #append(CharSequence, int, int)}, * {@link #setStartOffset}, * {@link #setEndOffset} * {@link #setType} * @return this Token instance */ public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) { - clearNoTermBuffer(); - setTermBuffer(newTerm, newTermOffset, newTermLength); + clear(); + append(newTerm, newTermOffset, newTermOffset + newTermLength); startOffset = newStartOffset; endOffset = newEndOffset; type = newType; @@ -504,14 +504,14 @@ } /** Shorthand for calling {@link #clear}, - * {@link #setTermBuffer(String)}, + * {@link #append(CharSequence)}, * {@link #setStartOffset}, * {@link #setEndOffset} * {@link #setType} on Token.DEFAULT_TYPE * @return this Token instance */ public Token reinit(String newTerm, int newStartOffset, int newEndOffset) { - clearNoTermBuffer(); - setTermBuffer(newTerm); + clear(); + append(newTerm); startOffset = newStartOffset; endOffset = newEndOffset; type = DEFAULT_TYPE; @@ -519,14 +519,14 @@ } /** Shorthand for calling {@link #clear}, - * {@link #setTermBuffer(String, int, int)}, + * {@link #append(CharSequence, int, int)}, * {@link #setStartOffset}, * {@link #setEndOffset} * {@link #setType} on Token.DEFAULT_TYPE * @return this Token instance */ public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { - clearNoTermBuffer(); - setTermBuffer(newTerm, newTermOffset, newTermLength); + clear(); + append(newTerm, newTermOffset, newTermOffset + newTermLength); startOffset = newStartOffset; endOffset = newEndOffset; type = DEFAULT_TYPE; @@ -538,7 +538,7 @@ * @param prototype */ public void reinit(Token prototype) { - setTermBuffer(prototype.termBuffer(), 0, prototype.termLength()); + copyBuffer(prototype.buffer(), 0, prototype.length()); positionIncrement = prototype.positionIncrement; flags = prototype.flags; 
startOffset = prototype.startOffset; @@ -553,7 +553,7 @@ * @param newTerm */ public void reinit(Token prototype, String newTerm) { - setTermBuffer(newTerm); + setEmpty().append(newTerm); positionIncrement = prototype.positionIncrement; flags = prototype.flags; startOffset = prototype.startOffset; @@ -570,7 +570,7 @@ * @param length */ public void reinit(Token prototype, char[] newTermBuffer, int offset, int length) { - setTermBuffer(newTermBuffer, offset, length); + copyBuffer(newTermBuffer, offset, length); positionIncrement = prototype.positionIncrement; flags = prototype.flags; startOffset = prototype.startOffset; Index: src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttribute.java =================================================================== --- src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttribute.java (revision 0) +++ src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttribute.java (revision 0) @@ -0,0 +1,71 @@ +package org.apache.lucene.analysis.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.Attribute; + +/** + * The term text of a Token. 
+ */ +public interface CharTermAttribute extends Attribute, CharSequence, Appendable { + + /** Copies the contents of buffer, starting at offset for + * length characters, into the termBuffer array. + * @param buffer the buffer to copy + * @param offset the index in the buffer of the first character to copy + * @param length the number of characters to copy + */ + public void copyBuffer(char[] buffer, int offset, int length); + + /** Returns the internal termBuffer character array which + * you can then directly alter. If the array is too + * small for your token, use {@link + * #resizeBuffer(int)} to increase it. After + * altering the buffer be sure to call {@link + * #setLength} to record the number of valid + * characters that were placed into the termBuffer. */ + public char[] buffer(); + + /** Grows the termBuffer to at least size newSize, preserving the + * existing content. + * @param newSize minimum size of the new termBuffer + * @return newly created termBuffer with length >= newSize + */ + public char[] resizeBuffer(int newSize); + + /** Set number of valid characters (length of the term) in + * the termBuffer array. Use this to truncate the termBuffer + * or to synchronize with external manipulation of the termBuffer. + * Note: to grow the size of the array, + * use {@link #resizeBuffer(int)} first. + * @param length the truncated length + */ + public CharTermAttribute setLength(int length); + + /** Sets the length of the termBuffer to zero. + * Use this method before appending contents + * using the {@link Appendable} interface. 
+ */ + public CharTermAttribute setEmpty(); + + // the following methods are redefined to get rid of IOException declaration: + public CharTermAttribute append(CharSequence csq); + public CharTermAttribute append(CharSequence csq, int start, int end); + public CharTermAttribute append(char c); + +} Property changes on: src\java\org\apache\lucene\analysis\tokenattributes\CharTermAttribute.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java =================================================================== --- src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java (revision 0) +++ src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java (revision 0) @@ -0,0 +1,256 @@ +package org.apache.lucene.analysis.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.Serializable; +import java.nio.CharBuffer; + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.UnicodeUtil; + +/** + * The term text of a Token. + */ +public class CharTermAttributeImpl extends AttributeImpl implements CharTermAttribute, TermAttribute, TermToBytesRefAttribute, Cloneable, Serializable { + private static int MIN_BUFFER_SIZE = 10; + + private char[] termBuffer = new char[ArrayUtil.oversize(MIN_BUFFER_SIZE, RamUsageEstimator.NUM_BYTES_CHAR)]; + private int termLength = 0; + + @Deprecated + public String term() { + // don't delegate to toString() here! + return new String(termBuffer, 0, termLength); + } + + public void copyBuffer(char[] buffer, int offset, int length) { + growTermBuffer(length); + System.arraycopy(buffer, offset, termBuffer, 0, length); + termLength = length; + } + + @Deprecated + public void setTermBuffer(char[] buffer, int offset, int length) { + copyBuffer(buffer, offset, length); + } + + @Deprecated + public void setTermBuffer(String buffer) { + int length = buffer.length(); + growTermBuffer(length); + buffer.getChars(0, length, termBuffer, 0); + termLength = length; + } + + @Deprecated + public void setTermBuffer(String buffer, int offset, int length) { + assert offset <= buffer.length(); + assert offset + length <= buffer.length(); + growTermBuffer(length); + buffer.getChars(offset, offset + length, termBuffer, 0); + termLength = length; + } + + public char[] buffer() { + return termBuffer; + } + + @Deprecated + public char[] termBuffer() { + return termBuffer; + } + + public char[] resizeBuffer(int newSize) { + if (termBuffer == null) { + // The buffer is always at least MIN_BUFFER_SIZE + termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? 
MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)]; + } else { + if(termBuffer.length < newSize){ + // Not big enough; create a new array with slight + // over allocation and preserve content + final char[] newCharBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)]; + System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length); + termBuffer = newCharBuffer; + } + } + return termBuffer; + } + + @Deprecated + public char[] resizeTermBuffer(int newSize) { + return resizeBuffer(newSize); + } + + private void growTermBuffer(int newSize) { + if (termBuffer == null) { + // The buffer is always at least MIN_BUFFER_SIZE + termBuffer = new char[ArrayUtil.oversize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize, RamUsageEstimator.NUM_BYTES_CHAR)]; + } else { + if(termBuffer.length < newSize){ + // Not big enough; create a new array with slight + // over allocation: + termBuffer = new char[ArrayUtil.oversize(newSize, RamUsageEstimator.NUM_BYTES_CHAR)]; + } + } + } + + @Deprecated + public int termLength() { + return termLength; + } + + public CharTermAttribute setLength(int length) { + if (length > termBuffer.length) + throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")"); + termLength = length; + return this; + } + + public CharTermAttribute setEmpty() { + termLength = 0; + return this; + } + + @Deprecated + public void setTermLength(int length) { + setLength(length); + } + + // *** TermToBytesRefAttribute interface *** + public int toBytesRef(BytesRef target) { + // nocommit: Maybe assume that bytes is already initialized? TermsHashPerField ensures this. 
+ if (target.bytes == null) { + target.bytes = new byte[termLength * 4]; + } + return UnicodeUtil.UTF16toUTF8WithHash(termBuffer, 0, termLength, target); + } + + // *** CharSequence interface *** + public int length() { + return termLength; + } + + public char charAt(int index) { + if (index >= termLength) + throw new IndexOutOfBoundsException(); + return termBuffer[index]; + } + + public CharSequence subSequence(final int start, final int end) { + if (start > termLength || end > termLength) + throw new IndexOutOfBoundsException(); + return new String(termBuffer, start, end - start); + } + + // *** Appendable interface *** + public CharTermAttribute append(CharSequence csq) { + return append(csq, 0, csq.length()); + } + + public CharTermAttribute append(CharSequence csq, int start, int end) { + resizeBuffer(termLength + end - start); + if (csq instanceof String) { + ((String) csq).getChars(start, end, termBuffer, termLength); + } else if (csq instanceof StringBuilder) { + ((StringBuilder) csq).getChars(start, end, termBuffer, termLength); + } else if (csq instanceof StringBuffer) { + ((StringBuffer) csq).getChars(start, end, termBuffer, termLength); + } else if (csq instanceof CharBuffer && ((CharBuffer) csq).hasArray()) { + final CharBuffer cb = (CharBuffer) csq; + System.arraycopy(cb.array(), cb.arrayOffset() + cb.position() + start, termBuffer, termLength, end - start); + } else { + while (start < end) + termBuffer[termLength++] = csq.charAt(start++); + // no fall-through here, as termLength is updated! 
+ return this; + } + termLength += end - start; + return this; + } + + public CharTermAttribute append(char c) { + resizeBuffer(termLength + 1)[termLength++] = c; + return this; + } + + // *** AttributeImpl *** + + @Override + public int hashCode() { + int code = termLength; + code = code * 31 + ArrayUtil.hashCode(termBuffer, 0, termLength); + return code; + } + + @Override + public void clear() { + termLength = 0; + } + + @Override + public Object clone() { + CharTermAttributeImpl t = (CharTermAttributeImpl)super.clone(); + // Do a deep clone + if (termBuffer != null) { + t.termBuffer = termBuffer.clone(); + } + return t; + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof CharTermAttributeImpl) { + final CharTermAttributeImpl o = ((CharTermAttributeImpl) other); + if (termLength != o.termLength) + return false; + for(int i=0;i> it = clone.getAttributeClassesIterator(); - assertEquals("TermAttribute must be the first attribute", TermAttribute.class, it.next()); + assertEquals("FlagsAttribute must be the first attribute", FlagsAttribute.class, it.next()); assertEquals("TypeAttribute must be the second attribute", TypeAttribute.class, it.next()); assertFalse("No more attributes", it.hasNext()); - final TermAttribute termAtt2 = clone.getAttribute(TermAttribute.class); + final FlagsAttribute flagsAtt2 = clone.getAttribute(FlagsAttribute.class); final TypeAttribute typeAtt2 = clone.getAttribute(TypeAttribute.class); - assertNotSame("TermAttribute of original and clone must be different instances", termAtt2, termAtt); + assertNotSame("FlagsAttribute of original and clone must be different instances", flagsAtt2, flagsAtt); assertNotSame("TypeAttribute of original and clone must be different instances", typeAtt2, typeAtt); - assertEquals("TermAttribute of original and clone must be equal", termAtt2, termAtt); + assertEquals("FlagsAttribute of original and clone must be equal", flagsAtt2, flagsAtt); 
assertEquals("TypeAttribute of original and clone must be equal", typeAtt2, typeAtt); // test copy back - termAtt2.setTermBuffer("OtherTerm"); + flagsAtt2.setFlags(4711); typeAtt2.setType("OtherType"); clone.copyTo(src); - assertEquals("TermAttribute of original must now contain updated term", "OtherTerm", termAtt.term()); + assertEquals("FlagsAttribute of original must now contain updated term", 4711, flagsAtt.getFlags()); assertEquals("TypeAttribute of original must now contain updated type", "OtherType", typeAtt.type()); // verify again: - assertNotSame("TermAttribute of original and clone must be different instances", termAtt2, termAtt); + assertNotSame("FlagsAttribute of original and clone must be different instances", flagsAtt2, flagsAtt); assertNotSame("TypeAttribute of original and clone must be different instances", typeAtt2, typeAtt); - assertEquals("TermAttribute of original and clone must be equal", termAtt2, termAtt); + assertEquals("FlagsAttribute of original and clone must be equal", flagsAtt2, flagsAtt); assertEquals("TypeAttribute of original and clone must be equal", typeAtt2, typeAtt); } public void testToStringAndMultiAttributeImplementations() { AttributeSource src = new AttributeSource(); - TermAttribute termAtt = src.addAttribute(TermAttribute.class); + CharTermAttribute termAtt = src.addAttribute(CharTermAttribute.class); TypeAttribute typeAtt = src.addAttribute(TypeAttribute.class); - termAtt.setTermBuffer("TestTerm"); + termAtt.append("TestTerm"); typeAtt.setType("TestType"); assertEquals("Attributes should appear in original order", "("+termAtt.toString()+","+typeAtt.toString()+")", src.toString()); Iterator it = src.getAttributeImplsIterator(); @@ -125,23 +125,23 @@ src = new AttributeSource(); src.addAttributeImpl(new Token()); - // this should not add a new attribute as Token implements TermAttribute, too - termAtt = src.addAttribute(TermAttribute.class); - assertTrue("TermAttribute should be implemented by Token", termAtt instanceof 
Token); + // this should not add a new attribute as Token implements CharTermAttribute, too + termAtt = src.addAttribute(CharTermAttribute.class); + assertTrue("CharTermAttribute should be implemented by Token", termAtt instanceof Token); // get the Token attribute and check, that it is the only one it = src.getAttributeImplsIterator(); Token tok = (Token) it.next(); assertFalse("There should be only one attribute implementation instance", it.hasNext()); - termAtt.setTermBuffer("TestTerm"); + termAtt.setEmpty().append("TestTerm"); assertEquals("Token should only printed once", "("+tok.toString()+")", src.toString()); } public void testDefaultAttributeFactory() throws Exception { AttributeSource src = new AttributeSource(); - assertTrue("TermAttribute is not implemented by TermAttributeImpl", - src.addAttribute(TermAttribute.class) instanceof TermAttributeImpl); + assertTrue("CharTermAttribute is not implemented by CharTermAttributeImpl", + src.addAttribute(CharTermAttribute.class) instanceof CharTermAttributeImpl); assertTrue("OffsetAttribute is not implemented by OffsetAttributeImpl", src.addAttribute(OffsetAttribute.class) instanceof OffsetAttributeImpl); assertTrue("FlagsAttribute is not implemented by FlagsAttributeImpl",