Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java (revision 890476) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java (working copy) @@ -61,26 +61,19 @@ @Override public boolean incrementToken() throws IOException { - boolean result = false; if (input.incrementToken()) { final char[] buffer = termAtt.termBuffer(); final int length = termAtt.termLength(); - //look for the delimiter - boolean seen = false; for (int i = 0; i < length; i++) { if (buffer[i] == delimiter) { - termAtt.setTermBuffer(buffer, 0, i); + termAtt.setTermLength(i); // simply set a new length payAtt.setPayload(encoder.encode(buffer, i + 1, (length - (i + 1)))); - seen = true; - break;//at this point, we know the whole piece, so we can exit. 
If we don't see the delimiter, then the termAtt is the same + return true; } } - if (seen == false) { - //no delimiter - payAtt.setPayload(null); - } - result = true; - } - return result; + // we have not seen the delimiter + payAtt.setPayload(null); + return true; + } else return false; } } Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/IdentityEncoder.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/IdentityEncoder.java (revision 890476) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/IdentityEncoder.java (working copy) @@ -18,9 +18,9 @@ import org.apache.lucene.index.Payload; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; import java.nio.charset.Charset; -import java.util.Arrays; -import java.io.UnsupportedEncodingException; /** @@ -30,28 +30,29 @@ public class IdentityEncoder extends AbstractEncoder implements PayloadEncoder{ protected Charset charset = Charset.forName("UTF-8"); - protected String charsetName = "UTF-8"; //argh, stupid 1.4 + + @Deprecated + protected String charsetName = charset.name(); public IdentityEncoder() { } public IdentityEncoder(Charset charset) { this.charset = charset; + // @deprecated, remove this in 4.0: charsetName = charset.name(); } public Payload encode(char[] buffer, int offset, int length) { - //what's the most efficient way to get a byte [] from a char[] array - //Do we have to go through String? - String tmp = new String(buffer, offset, length); - Payload result = null;//Can we avoid allocating by knowing where using the new API? 
- try { - result = new Payload(tmp.getBytes(charsetName)); - } catch (UnsupportedEncodingException e) { - //should never hit this, since we get the name from the Charset + final ByteBuffer bb = charset.encode(CharBuffer.wrap(buffer, offset, length)); + if (bb.hasArray()) { + return new Payload(bb.array(), bb.arrayOffset() + bb.position(), bb.remaining()); + } else { + // normally it should always have an array, but who knows? + final byte[] b = new byte[bb.remaining()]; + bb.get(b); + return new Payload(b); } - - return result; } } Index: contrib/CHANGES.txt =================================================================== --- contrib/CHANGES.txt (revision 890476) +++ contrib/CHANGES.txt (working copy) @@ -65,6 +65,12 @@ into core, and moved the ICU-based collation support into contrib/icu. (Robert Muir) +Optimizations + + * LUCENE-2157: DelimitedPayloadTokenFilter no longer copies the buffer + over itself. Instead it sets only the length. This patch also optimizes + the logic of the filter and uses NIO for IdentityEncoder. (Uwe Schindler) + Test Cases * LUCENE-2115: Cutover contrib tests to use Java5 generics. (Kay Kay