Index: common-build.xml =================================================================== --- common-build.xml (revision 684150) +++ common-build.xml (working copy) @@ -78,6 +78,7 @@ + Index: src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java =================================================================== --- src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java (revision 684150) +++ src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java (working copy) @@ -141,34 +141,34 @@ private final class TestFilter extends TokenFilter { - private org.apache.lucene.analysis.Token prevToken; + private Token prevToken; public TestFilter(TokenStream in) { super(in); } - public final org.apache.lucene.analysis.Token next() throws java.io.IOException { + public final Token next(Token token) throws java.io.IOException { if (multiToken > 0) { - org.apache.lucene.analysis.Token token = - new org.apache.lucene.analysis.Token("multi"+(multiToken+1), prevToken.startOffset(), - prevToken.endOffset(), prevToken.type()); + token.reinit("multi"+(multiToken+1), prevToken.startOffset(), prevToken.endOffset(), prevToken.type()); token.setPositionIncrement(0); multiToken--; return token; } else { - org.apache.lucene.analysis.Token t = input.next(); - prevToken = t; - if (t == null) + token = input.next(token); + if (token == null) { + prevToken = null; return null; - String text = t.termText(); + } + prevToken = (Token) token.clone(); + String text = token.term(); if (text.equals("triplemulti")) { multiToken = 2; - return t; + return token; } else if (text.equals("multi")) { multiToken = 1; - return t; + return token; } else { - return t; + return token; } } } @@ -197,20 +197,14 @@ super(in); } - public final org.apache.lucene.analysis.Token next() throws java.io.IOException { - for (Token t = input.next(); t != null; t = input.next()) { - if (t.termText().equals("the")) { + public final Token next(Token token) throws java.io.IOException { + for (token = input.next(token); token != null; token = input.next(token)) { + if (token.term().equals("the")) { // stopword, do nothing - } else if (t.termText().equals("quick")) { - org.apache.lucene.analysis.Token token = - new org.apache.lucene.analysis.Token(t.termText(), t.startOffset(), - t.endOffset(), t.type()); + } else if (token.term().equals("quick")) { token.setPositionIncrement(2); return token; } else { - org.apache.lucene.analysis.Token token = - new org.apache.lucene.analysis.Token(t.termText(), t.startOffset(), - t.endOffset(), t.type()); token.setPositionIncrement(1); return token; } Index: src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java =================================================================== --- src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java (revision 684150) +++ src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java (working copy) @@ -319,7 +319,7 @@ } private static class EmptyTokenStream extends TokenStream { - public Token next() { + public Token next(Token token) { return null; } } Index: src/test/org/apache/lucene/queryParser/TestQueryParser.java =================================================================== --- src/test/org/apache/lucene/queryParser/TestQueryParser.java (revision 684150) +++ src/test/org/apache/lucene/queryParser/TestQueryParser.java (working copy) @@ -75,18 +75,18 @@ boolean inPhrase = false; int savedStart = 0, savedEnd = 0; - public Token next() throws IOException { + public Token next(Token token) throws IOException { if 
(inPhrase) { inPhrase = false; - return new Token("phrase2", savedStart, savedEnd); + return token.reinit("phrase2", savedStart, savedEnd); } else - for (Token token = input.next(); token != null; token = input.next()) { - if (token.termText().equals("phrase")) { + for (token = input.next(token); token != null; token = input.next(token)) { + if (token.term().equals("phrase")) { inPhrase = true; savedStart = token.startOffset(); savedEnd = token.endOffset(); - return new Token("phrase1", savedStart, savedEnd); - } else if (!token.termText().equals("stop")) + return token.reinit("phrase1", savedStart, savedEnd); + } else if (!token.term().equals("stop")) return token; } return null; Index: src/test/org/apache/lucene/analysis/TestToken.java =================================================================== --- src/test/org/apache/lucene/analysis/TestToken.java (revision 684150) +++ src/test/org/apache/lucene/analysis/TestToken.java (working copy) @@ -17,7 +17,6 @@ * limitations under the License. */ -import java.io.*; import org.apache.lucene.util.LuceneTestCase; public class TestToken extends LuceneTestCase { @@ -26,6 +25,119 @@ super(name); } + public void testCtor() throws Exception { + Token t = new Token(); + char[] content = "hello".toCharArray(); + t.setTermBuffer(content, 0, content.length); + char[] buf = t.termBuffer(); + assertNotSame(t.termBuffer(), content); + assertEquals("hello", t.term()); + assertEquals("word", t.type()); + assertEquals(0, t.getFlags()); + + t = new Token(6, 22); + t.setTermBuffer(content, 0, content.length); + assertEquals("hello", t.term()); + assertEquals("(hello,6,22)", t.toString()); + assertEquals("word", t.type()); + assertEquals(0, t.getFlags()); + + t = new Token(6, 22, 7); + t.setTermBuffer(content, 0, content.length); + assertEquals("hello", t.term()); + assertEquals("(hello,6,22)", t.toString()); + assertEquals(7, t.getFlags()); + + t = new Token(6, 22, "junk"); + t.setTermBuffer(content, 0, content.length); + assertEquals("hello", t.term()); + assertEquals("(hello,6,22,type=junk)", t.toString()); + assertEquals(0, t.getFlags()); + } + + public void testResize() { + Token t = new Token(); + char[] content = "hello".toCharArray(); + t.setTermBuffer(content, 0, content.length); + for (int i = 0; i < 2000; i++) + { + t.resizeTermBuffer(i); + assertTrue(i <= t.termBuffer().length); + assertEquals("hello", t.term()); + } + } + + public void testGrow() { + Token t = new Token(); + StringBuffer buf = new StringBuffer("ab"); + for (int i = 0; i < 20; i++) + { + char[] content = buf.toString().toCharArray(); + t.setTermBuffer(content, 0, content.length); + assertEquals(buf.length(), t.termLength()); + assertEquals(buf.toString(), t.term()); + buf.append(buf.toString()); + } + assertEquals(1048576, t.termLength()); + assertEquals(1179654, t.termBuffer().length); + + // now as a string, first variant + t = new Token(); + buf = new StringBuffer("ab"); + for (int i = 0; i < 20; i++) + { + String content = buf.toString(); + t.setTermBuffer(content, 0, content.length()); + assertEquals(content.length(), t.termLength()); + assertEquals(content, t.term()); + buf.append(content); + } + assertEquals(1048576, t.termLength()); + assertEquals(1179654, t.termBuffer().length); + + // now as a string, second variant + t = new Token(); + buf = new StringBuffer("ab"); + for (int i = 0; i < 20; i++) + { + String content = buf.toString(); + t.setTermBuffer(content); + assertEquals(content.length(), t.termLength()); + assertEquals(content, t.term()); + buf.append(content); + 
} + assertEquals(1048576, t.termLength()); + assertEquals(1179654, t.termBuffer().length); + + // Test for slow growth to a long term + t = new Token(); + buf = new StringBuffer("a"); + for (int i = 0; i < 20000; i++) + { + String content = buf.toString(); + t.setTermBuffer(content); + assertEquals(content.length(), t.termLength()); + assertEquals(content, t.term()); + buf.append("a"); + } + assertEquals(20000, t.termLength()); + assertEquals(20331, t.termBuffer().length); + + // Test for slow growth to a long term + t = new Token(); + buf = new StringBuffer("a"); + for (int i = 0; i < 20000; i++) + { + String content = buf.toString(); + t.setTermBuffer(content); + assertEquals(content.length(), t.termLength()); + assertEquals(content, t.term()); + buf.append("a"); + } + assertEquals(20000, t.termLength()); + assertEquals(20331, t.termBuffer().length); + } + public void testToString() throws Exception { char[] b = {'a', 'l', 'o', 'h', 'a'}; Token t = new Token("", 0, 5); @@ -40,10 +152,10 @@ Token t = new Token("hello", 0, 5); assertEquals(t.termText(), "hello"); assertEquals(t.termLength(), 5); - assertEquals(new String(t.termBuffer(), 0, 5), "hello"); + assertEquals(t.term(), "hello"); t.setTermText("hello2"); assertEquals(t.termLength(), 6); - assertEquals(new String(t.termBuffer(), 0, 6), "hello2"); + assertEquals(t.term(), "hello2"); t.setTermBuffer("hello3".toCharArray(), 0, 6); assertEquals(t.termText(), "hello3"); @@ -53,4 +165,13 @@ buffer[1] = 'o'; assertEquals(t.termText(), "hollo3"); } + + public void testClone() throws Exception { + Token t = new Token(0, 5); + char[] content = "hello".toCharArray(); + t.setTermBuffer(content, 0, 5); + char[] buf = t.termBuffer(); + Token copy = (Token) t.clone(); + assertNotSame(buf, copy.termBuffer()); + } } Index: src/test/org/apache/lucene/analysis/TestPerFieldAnalzyerWrapper.java =================================================================== --- src/test/org/apache/lucene/analysis/TestPerFieldAnalzyerWrapper.java (revision 684150) +++ src/test/org/apache/lucene/analysis/TestPerFieldAnalzyerWrapper.java (working copy) @@ -29,16 +29,16 @@ TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text)); - Token token = tokenStream.next(); + Token token = tokenStream.next(new Token()); assertEquals("WhitespaceAnalyzer does not lowercase", "Qwerty", - token.termText()); + token.term()); tokenStream = analyzer.tokenStream("special", new StringReader(text)); - token = tokenStream.next(); + token = tokenStream.next(token); assertEquals("SimpleAnalyzer lowercases", "qwerty", - token.termText()); + token.term()); } } Index: src/test/org/apache/lucene/analysis/TeeSinkTokenTest.java =================================================================== --- src/test/org/apache/lucene/analysis/TeeSinkTokenTest.java (revision 684150) +++ src/test/org/apache/lucene/analysis/TeeSinkTokenTest.java (working copy) @@ -16,10 +16,10 @@ * limitations under the License. 
*/ -import junit.framework.TestCase; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.util.English; +import org.apache.lucene.util.LuceneTestCase; import java.io.IOException; import java.io.StringReader; @@ -29,7 +29,7 @@ /** * tests for the TeeTokenFilter and SinkTokenizer */ -public class TeeSinkTokenTest extends TestCase { +public class TeeSinkTokenTest extends LuceneTestCase { protected StringBuffer buffer1; protected StringBuffer buffer2; protected String[] tokens1; @@ -63,23 +63,23 @@ SinkTokenizer sink1 = new SinkTokenizer(null) { public void add(Token t) { - if (t != null && t.termText().equalsIgnoreCase("The")) { + if (t != null && t.term().equalsIgnoreCase("The")) { super.add(t); } } }; TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), sink1); - Token token = null; int i = 0; - while ((token = source.next()) != null) { - assertTrue(token.termText() + " is not equal to " + tokens1[i], token.termText().equals(tokens1[i]) == true); + final Token reusableToken = new Token(); + for (Token token = source.next(reusableToken); token != null; token = source.next(reusableToken)) { + assertTrue(token.term() + " is not equal to " + tokens1[i], token.term().equals(tokens1[i]) == true); i++; } assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length); assertTrue("sink1 Size: " + sink1.getTokens().size() + " is not: " + 2, sink1.getTokens().size() == 2); i = 0; - while ((token = sink1.next()) != null) { - assertTrue(token.termText() + " is not equal to " + "The", token.termText().equalsIgnoreCase("The") == true); + for (Token token = sink1.next(reusableToken); token != null; token = sink1.next(reusableToken)) { + assertTrue(token.term() + " is not equal to " + "The", token.term().equalsIgnoreCase("The") == true); i++; } assertTrue(i + " does not equal: " + sink1.getTokens().size(), i == sink1.getTokens().size()); @@ -88,54 +88,54 @@ public void testMultipleSources() throws Exception { SinkTokenizer theDetector = new SinkTokenizer(null) { public void add(Token t) { - if (t != null && t.termText().equalsIgnoreCase("The")) { + if (t != null && t.term().equalsIgnoreCase("The")) { super.add(t); } } }; SinkTokenizer dogDetector = new SinkTokenizer(null) { public void add(Token t) { - if (t != null && t.termText().equalsIgnoreCase("Dogs")) { + if (t != null && t.term().equalsIgnoreCase("Dogs")) { super.add(t); } } }; TokenStream source1 = new CachingTokenFilter(new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), theDetector), dogDetector)); TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer2.toString())), theDetector), dogDetector); - Token token = null; int i = 0; - while ((token = source1.next()) != null) { - assertTrue(token.termText() + " is not equal to " + tokens1[i], token.termText().equals(tokens1[i]) == true); + final Token reusableToken = new Token(); + for (Token token = source1.next(reusableToken); token != null; token = source1.next(reusableToken)) { + assertTrue(token.term() + " is not equal to " + tokens1[i], token.term().equals(tokens1[i]) == true); i++; } assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length); assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 2, theDetector.getTokens().size() == 2); assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " 
is not: " + 1, dogDetector.getTokens().size() == 1); i = 0; - while ((token = source2.next()) != null) { - assertTrue(token.termText() + " is not equal to " + tokens2[i], token.termText().equals(tokens2[i]) == true); + for (Token token = source2.next(reusableToken); token != null; token = source2.next(reusableToken)) { + assertTrue(token.term() + " is not equal to " + tokens2[i], token.term().equals(tokens2[i]) == true); i++; } assertTrue(i + " does not equal: " + tokens2.length, i == tokens2.length); assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 4, theDetector.getTokens().size() == 4); assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 2, dogDetector.getTokens().size() == 2); i = 0; - while ((token = theDetector.next()) != null) { - assertTrue(token.termText() + " is not equal to " + "The", token.termText().equalsIgnoreCase("The") == true); + for (Token token = theDetector.next(reusableToken); token != null; token = theDetector.next(reusableToken)) { + assertTrue(token.term() + " is not equal to " + "The", token.term().equalsIgnoreCase("The") == true); i++; } assertTrue(i + " does not equal: " + theDetector.getTokens().size(), i == theDetector.getTokens().size()); i = 0; - while ((token = dogDetector.next()) != null) { - assertTrue(token.termText() + " is not equal to " + "Dogs", token.termText().equalsIgnoreCase("Dogs") == true); + for (Token token = dogDetector.next(reusableToken); token != null; token = dogDetector.next(reusableToken)) { + assertTrue(token.term() + " is not equal to " + "Dogs", token.term().equalsIgnoreCase("Dogs") == true); i++; } assertTrue(i + " does not equal: " + dogDetector.getTokens().size(), i == dogDetector.getTokens().size()); source1.reset(); TokenStream lowerCasing = new LowerCaseFilter(source1); i = 0; - while ((token = lowerCasing.next()) != null) { - assertTrue(token.termText() + " is not equal to " + tokens1[i].toLowerCase(), token.termText().equals(tokens1[i].toLowerCase()) == true); + for (Token token = lowerCasing.next(reusableToken); token != null; token = lowerCasing.next(reusableToken)) { + assertTrue(token.term() + " is not equal to " + tokens1[i].toLowerCase(), token.term().equals(tokens1[i].toLowerCase()) == true); i++; } assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length); @@ -157,14 +157,14 @@ } //make sure we produce the same tokens ModuloSinkTokenizer sink = new ModuloSinkTokenizer(tokCount[k], 100); - Token next = new Token(); + final Token reusableToken = new Token(); TokenStream result = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), sink); - while ((next = result.next(next)) != null) { + while (result.next(reusableToken) != null) { } result = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), 100); - next = new Token(); List tmp = new ArrayList(); - while ((next = result.next(next)) != null) { + Token next; + while ((next = result.next(reusableToken)) != null) { tmp.add(next.clone()); } List sinkList = sink.getTokens(); @@ -172,7 +172,7 @@ for (int i = 0; i < tmp.size(); i++) { Token tfTok = (Token) tmp.get(i); Token sinkTok = (Token) sinkList.get(i); - assertTrue(tfTok.termText() + " is not equal to " + sinkTok.termText() + " at token: " + i, tfTok.termText().equals(sinkTok.termText()) == true); + assertTrue(tfTok.term() + " is not equal to " + sinkTok.term() + " at token: " + i, tfTok.term().equals(sinkTok.term()) == true); } //simulate two fields, 
each being analyzed once, for 20 documents @@ -180,14 +180,12 @@ int tfPos = 0; long start = System.currentTimeMillis(); for (int i = 0; i < 20; i++) { - next = new Token(); result = new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))); - while ((next = result.next(next)) != null) { + while ((next = result.next(reusableToken)) != null) { tfPos += next.getPositionIncrement(); } - next = new Token(); result = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), modCounts[j]); - while ((next = result.next(next)) != null) { + while ((next = result.next(reusableToken)) != null) { tfPos += next.getPositionIncrement(); } } @@ -198,14 +196,13 @@ start = System.currentTimeMillis(); for (int i = 0; i < 20; i++) { sink = new ModuloSinkTokenizer(tokCount[k], modCounts[j]); - next = new Token(); result = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), sink); - while ((next = result.next(next)) != null) { + while ((next = result.next(reusableToken)) != null) { sinkPos += next.getPositionIncrement(); } //System.out.println("Modulo--------"); result = sink; - while ((next = result.next(next)) != null) { + while ((next = result.next(reusableToken)) != null) { sinkPos += next.getPositionIncrement(); } } @@ -254,7 +251,7 @@ public void add(Token t) { if (t != null && count % modCount == 0) { - lst.add(t.clone()); + super.add(t); } count++; } Index: src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java =================================================================== --- src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java (revision 684150) +++ src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java (working copy) @@ -42,11 +42,11 @@ TokenStream stream = new TokenStream() { private int index = 0; - public Token next() throws IOException { + public Token next(Token token) throws IOException { if (index == tokens.length) { return null; } else { - return new Token(tokens[index++], 0, 0); + return token.reinit(tokens[index++], 0, 0); } } @@ -91,10 +91,10 @@ private void checkTokens(TokenStream stream) throws IOException { int count = 0; - Token token; - while ((token = stream.next()) != null) { + final Token reusableToken = new Token(); + for (Token token = stream.next(reusableToken); token != null; token = stream.next(token)) { assertTrue(count < tokens.length); - assertEquals(tokens[count], token.termText()); + assertEquals(tokens[count], token.term()); count++; } Index: src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java =================================================================== --- src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java (revision 684150) +++ src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java (working copy) @@ -35,10 +35,11 @@ public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes, int[] expectedPosIncrs) throws Exception { TokenStream ts = a.tokenStream("dummy", new StringReader(input)); + Token t = new Token(); for (int i = 0; i < expectedImages.length; i++) { - Token t = ts.next(); + t = ts.next(t); assertNotNull(t); - assertEquals(expectedImages[i], t.termText()); + assertEquals(expectedImages[i], t.term()); if (expectedTypes != null) { assertEquals(expectedTypes[i], t.type()); } @@ -46,7 +47,7 @@ assertEquals(expectedPosIncrs[i], t.getPositionIncrement()); } } - assertNull(ts.next()); + assertNull(ts.next(t)); ts.close(); } Index: 
src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java =================================================================== --- src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java (revision 684150) +++ src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java (working copy) @@ -25,81 +25,82 @@ public void testU() throws Exception { TokenStream stream = new WhitespaceTokenizer(new StringReader("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï IJ Ð Ñ Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ij ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl")); ISOLatin1AccentFilter filter = new ISOLatin1AccentFilter(stream); - assertEquals("Des", filter.next().termText()); - assertEquals("mot", filter.next().termText()); - assertEquals("cles", filter.next().termText()); - assertEquals("A", filter.next().termText()); - assertEquals("LA", filter.next().termText()); - assertEquals("CHAINE", filter.next().termText()); - assertEquals("A", filter.next().termText()); - assertEquals("A", filter.next().termText()); - assertEquals("A", filter.next().termText()); - assertEquals("A", filter.next().termText()); - assertEquals("A", filter.next().termText()); - assertEquals("A", filter.next().termText()); - assertEquals("AE", filter.next().termText()); - assertEquals("C", filter.next().termText()); - assertEquals("E", filter.next().termText()); - assertEquals("E", filter.next().termText()); - assertEquals("E", filter.next().termText()); - assertEquals("E", filter.next().termText()); - assertEquals("I", filter.next().termText()); - assertEquals("I", filter.next().termText()); - assertEquals("I", filter.next().termText()); - assertEquals("I", filter.next().termText()); - assertEquals("IJ", filter.next().termText()); - assertEquals("D", filter.next().termText()); - assertEquals("N", filter.next().termText()); - assertEquals("O", filter.next().termText()); - assertEquals("O", filter.next().termText()); - assertEquals("O", filter.next().termText()); - assertEquals("O", filter.next().termText()); - assertEquals("O", filter.next().termText()); - assertEquals("O", filter.next().termText()); - assertEquals("OE", filter.next().termText()); - assertEquals("TH", filter.next().termText()); - assertEquals("U", filter.next().termText()); - assertEquals("U", filter.next().termText()); - assertEquals("U", filter.next().termText()); - assertEquals("U", filter.next().termText()); - assertEquals("Y", filter.next().termText()); - assertEquals("Y", filter.next().termText()); - assertEquals("a", filter.next().termText()); - assertEquals("a", filter.next().termText()); - assertEquals("a", filter.next().termText()); - assertEquals("a", filter.next().termText()); - assertEquals("a", filter.next().termText()); - assertEquals("a", filter.next().termText()); - assertEquals("ae", filter.next().termText()); - assertEquals("c", filter.next().termText()); - assertEquals("e", filter.next().termText()); - assertEquals("e", filter.next().termText()); - assertEquals("e", filter.next().termText()); - assertEquals("e", filter.next().termText()); - assertEquals("i", filter.next().termText()); - assertEquals("i", filter.next().termText()); - assertEquals("i", filter.next().termText()); - assertEquals("i", filter.next().termText()); - assertEquals("ij", filter.next().termText()); - assertEquals("d", filter.next().termText()); - assertEquals("n", filter.next().termText()); - assertEquals("o", filter.next().termText()); - assertEquals("o", filter.next().termText()); - assertEquals("o", filter.next().termText()); 
- assertEquals("o", filter.next().termText()); - assertEquals("o", filter.next().termText()); - assertEquals("o", filter.next().termText()); - assertEquals("oe", filter.next().termText()); - assertEquals("ss", filter.next().termText()); - assertEquals("th", filter.next().termText()); - assertEquals("u", filter.next().termText()); - assertEquals("u", filter.next().termText()); - assertEquals("u", filter.next().termText()); - assertEquals("u", filter.next().termText()); - assertEquals("y", filter.next().termText()); - assertEquals("y", filter.next().termText()); - assertEquals("fi", filter.next().termText()); - assertEquals("fl", filter.next().termText()); - assertNull(filter.next()); + Token token = new Token(); + assertEquals("Des", filter.next(token).term()); + assertEquals("mot", filter.next(token).term()); + assertEquals("cles", filter.next(token).term()); + assertEquals("A", filter.next(token).term()); + assertEquals("LA", filter.next(token).term()); + assertEquals("CHAINE", filter.next(token).term()); + assertEquals("A", filter.next(token).term()); + assertEquals("A", filter.next(token).term()); + assertEquals("A", filter.next(token).term()); + assertEquals("A", filter.next(token).term()); + assertEquals("A", filter.next(token).term()); + assertEquals("A", filter.next(token).term()); + assertEquals("AE", filter.next(token).term()); + assertEquals("C", filter.next(token).term()); + assertEquals("E", filter.next(token).term()); + assertEquals("E", filter.next(token).term()); + assertEquals("E", filter.next(token).term()); + assertEquals("E", filter.next(token).term()); + assertEquals("I", filter.next(token).term()); + assertEquals("I", filter.next(token).term()); + assertEquals("I", filter.next(token).term()); + assertEquals("I", filter.next(token).term()); + assertEquals("IJ", filter.next(token).term()); + assertEquals("D", filter.next(token).term()); + assertEquals("N", filter.next(token).term()); + assertEquals("O", filter.next(token).term()); + assertEquals("O", filter.next(token).term()); + assertEquals("O", filter.next(token).term()); + assertEquals("O", filter.next(token).term()); + assertEquals("O", filter.next(token).term()); + assertEquals("O", filter.next(token).term()); + assertEquals("OE", filter.next(token).term()); + assertEquals("TH", filter.next(token).term()); + assertEquals("U", filter.next(token).term()); + assertEquals("U", filter.next(token).term()); + assertEquals("U", filter.next(token).term()); + assertEquals("U", filter.next(token).term()); + assertEquals("Y", filter.next(token).term()); + assertEquals("Y", filter.next(token).term()); + assertEquals("a", filter.next(token).term()); + assertEquals("a", filter.next(token).term()); + assertEquals("a", filter.next(token).term()); + assertEquals("a", filter.next(token).term()); + assertEquals("a", filter.next(token).term()); + assertEquals("a", filter.next(token).term()); + assertEquals("ae", filter.next(token).term()); + assertEquals("c", filter.next(token).term()); + assertEquals("e", filter.next(token).term()); + assertEquals("e", filter.next(token).term()); + assertEquals("e", filter.next(token).term()); + assertEquals("e", filter.next(token).term()); + assertEquals("i", filter.next(token).term()); + assertEquals("i", filter.next(token).term()); + assertEquals("i", filter.next(token).term()); + assertEquals("i", filter.next(token).term()); + assertEquals("ij", filter.next(token).term()); + assertEquals("d", filter.next(token).term()); + assertEquals("n", filter.next(token).term()); + assertEquals("o", 
filter.next(token).term()); + assertEquals("o", filter.next(token).term()); + assertEquals("o", filter.next(token).term()); + assertEquals("o", filter.next(token).term()); + assertEquals("o", filter.next(token).term()); + assertEquals("o", filter.next(token).term()); + assertEquals("oe", filter.next(token).term()); + assertEquals("ss", filter.next(token).term()); + assertEquals("th", filter.next(token).term()); + assertEquals("u", filter.next(token).term()); + assertEquals("u", filter.next(token).term()); + assertEquals("u", filter.next(token).term()); + assertEquals("u", filter.next(token).term()); + assertEquals("y", filter.next(token).term()); + assertEquals("y", filter.next(token).term()); + assertEquals("fi", filter.next(token).term()); + assertEquals("fl", filter.next(token).term()); + assertNull(filter.next(token)); } } Index: src/test/org/apache/lucene/analysis/TestLengthFilter.java =================================================================== --- src/test/org/apache/lucene/analysis/TestLengthFilter.java (revision 684150) +++ src/test/org/apache/lucene/analysis/TestLengthFilter.java (working copy) @@ -27,10 +27,11 @@ TokenStream stream = new WhitespaceTokenizer( new StringReader("short toolong evenmuchlongertext a ab toolong foo")); LengthFilter filter = new LengthFilter(stream, 2, 6); - assertEquals("short", filter.next().termText()); - assertEquals("ab", filter.next().termText()); - assertEquals("foo", filter.next().termText()); - assertNull(filter.next()); + Token token = new Token(); + assertEquals("short", filter.next(token).term()); + assertEquals("ab", filter.next(token).term()); + assertEquals("foo", filter.next(token).term()); + assertNull(filter.next(token)); } } Index: src/test/org/apache/lucene/analysis/TestAnalyzers.java =================================================================== --- src/test/org/apache/lucene/analysis/TestAnalyzers.java (revision 684150) +++ src/test/org/apache/lucene/analysis/TestAnalyzers.java (working copy) @@ -17,13 +17,14 @@ * limitations under the License. 
*/ -import java.io.*; +import java.io.IOException; +import java.io.StringReader; +import java.util.LinkedList; import java.util.List; -import java.util.LinkedList; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.index.Payload; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.index.Payload; -import org.apache.lucene.analysis.standard.StandardTokenizer; public class TestAnalyzers extends LuceneTestCase { @@ -35,12 +36,14 @@ String input, String[] output) throws Exception { TokenStream ts = a.tokenStream("dummy", new StringReader(input)); + final Token reusableToken = new Token(); + Token t; for (int i=0; i test with enable-increments-"+(enableIcrements?"enabled":"disabled")); stpf.setEnablePositionIncrements(enableIcrements); + Token t = new Token(); for (int i=0; i<20; i+=3) { - Token t = stpf.next(); + t = stpf.next(t); log("Token "+i+": "+t); String w = English.intToEnglish(i).trim(); - assertEquals("expecting token "+i+" to be "+w,w,t.termText()); + assertEquals("expecting token "+i+" to be "+w,w,t.term()); assertEquals("all but first token must have position increment of 3",enableIcrements?(i==0?1:3):1,t.getPositionIncrement()); } - assertNull(stpf.next()); + assertNull(stpf.next(t)); } // print debug info depending on VERBOSE Index: src/test/org/apache/lucene/AnalysisTest.java =================================================================== --- src/test/org/apache/lucene/AnalysisTest.java (revision 684150) +++ src/test/org/apache/lucene/AnalysisTest.java (working copy) @@ -70,9 +70,10 @@ Date start = new Date(); int count = 0; - for (Token t = stream.next(); t!=null; t = stream.next()) { + final Token reusableToken = new Token(); + for (Token t = stream.next(reusableToken); t!=null; t = stream.next(reusableToken)) { if (verbose) { - System.out.println("Text=" + new String(t.termBuffer(), 0, t.termLength()) + System.out.println("Text=" + t.term() + " start=" + t.startOffset() + " end=" + t.endOffset()); } Index: src/test/org/apache/lucene/search/TestPositionIncrement.java =================================================================== --- src/test/org/apache/lucene/search/TestPositionIncrement.java (revision 684150) +++ src/test/org/apache/lucene/search/TestPositionIncrement.java (working copy) @@ -49,13 +49,13 @@ private final int[] INCREMENTS = {1, 2, 1, 0, 1}; private int i = 0; - public Token next() { + public Token next(Token token) { if (i == TOKENS.length) return null; - Token t = new Token(TOKENS[i], i, i); - t.setPositionIncrement(INCREMENTS[i]); + token.reinit(TOKENS[i], i, i); + token.setPositionIncrement(INCREMENTS[i]); i++; - return t; + return token; } }; } @@ -204,11 +204,9 @@ Analyzer analyzer = new WhitespaceAnalyzer(); TokenStream ts = analyzer.tokenStream("field", new StringReader("one two three four five")); - - while (true) { - Token token = ts.next(); - if (token == null) break; - assertEquals(token.termText(), 1, token.getPositionIncrement()); + final Token reusableToken = new Token(); + for (Token token = ts.next(reusableToken); token != null; token = ts.next(reusableToken)) { + assertEquals(token.term(), 1, token.getPositionIncrement()); } } } Index: src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java =================================================================== --- src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java (revision 684150) +++ src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java (working copy) @@ -16,22 +16,32 @@ * 
limitations under the License. */ -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.analysis.*; +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseTokenizer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Payload; import org.apache.lucene.index.Term; -import org.apache.lucene.search.*; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.CheckHits; +import org.apache.lucene.search.DefaultSimilarity; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.spans.Spans; import org.apache.lucene.search.spans.TermSpans; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.English; +import org.apache.lucene.util.LuceneTestCase; -import java.io.IOException; -import java.io.Reader; - public class TestBoostingTermQuery extends LuceneTestCase { private IndexSearcher searcher; private BoostingSimilarity similarity = new BoostingSimilarity(); @@ -62,8 +72,8 @@ this.fieldName = fieldName; } - public Token next() throws IOException { - Token result = input.next(); + public Token next(Token token) throws IOException { + Token result = input.next(token); if (result != null) { if (fieldName.equals("field")) { result.setPayload(new Payload(payloadField)); Index: src/test/org/apache/lucene/index/TestTermVectorsReader.java =================================================================== --- src/test/org/apache/lucene/index/TestTermVectorsReader.java (revision 684150) +++ src/test/org/apache/lucene/index/TestTermVectorsReader.java (working copy) @@ -118,20 +118,17 @@ private class MyTokenStream extends TokenStream { int tokenUpto; - public Token next() { + public Token next(Token token) { if (tokenUpto >= tokens.length) return null; else { - final Token t = new Token(); final TestToken testToken = tokens[tokenUpto++]; - t.setTermText(testToken.text); + token.reinit(testToken.text, testToken.startOffset, testToken.endOffset); if (tokenUpto > 1) - t.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos); + token.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos); else - t.setPositionIncrement(testToken.pos+1); - t.setStartOffset(testToken.startOffset); - t.setEndOffset(testToken.endOffset); - return t; + token.setPositionIncrement(testToken.pos+1); + return token; } } } Index: src/test/org/apache/lucene/index/TestIndexWriter.java =================================================================== --- src/test/org/apache/lucene/index/TestIndexWriter.java (revision 684150) +++ src/test/org/apache/lucene/index/TestIndexWriter.java (working copy) @@ -1786,11 +1786,11 @@ return new TokenFilter(new StandardTokenizer(reader)) { private int count = 0; - public Token next() throws IOException { + public Token next(Token token) throws IOException { if (count++ == 5) { throw new IOException(); } - return input.next(); + return input.next(token); } }; } @@ -3574,13 +3574,13 @@ public void testNegativePositions() throws Throwable { SinkTokenizer tokens = new SinkTokenizer(); Token t = new Token(); - t.setTermText("a"); + t.setTermBuffer("a"); 
t.setPositionIncrement(0); tokens.add(t); - t.setTermText("b"); + t.setTermBuffer("b"); t.setPositionIncrement(1); tokens.add(t); - t.setTermText("c"); + t.setTermBuffer("c"); tokens.add(t); MockRAMDirectory dir = new MockRAMDirectory(); Index: src/test/org/apache/lucene/index/TestMultiLevelSkipList.java =================================================================== --- src/test/org/apache/lucene/index/TestMultiLevelSkipList.java (revision 684150) +++ src/test/org/apache/lucene/index/TestMultiLevelSkipList.java (working copy) @@ -103,12 +103,12 @@ super(input); } - public Token next() throws IOException { - Token t = input.next(); - if (t != null) { - t.setPayload(new Payload(new byte[] { (byte) count++ })); + public Token next(Token token) throws IOException { + token = input.next(token); + if (token != null) { + token.setPayload(new Payload(new byte[] { (byte) count++ })); } - return t; + return token; } } Index: src/test/org/apache/lucene/index/TestTermdocPerf.java =================================================================== --- src/test/org/apache/lucene/index/TestTermdocPerf.java (revision 684150) +++ src/test/org/apache/lucene/index/TestTermdocPerf.java (working copy) @@ -40,11 +40,12 @@ Token t; public RepeatingTokenStream(String val) { - t = new Token(val,0,val.length()); + t = new Token(0,val.length()); + t.setTermBuffer(val); } - public Token next() throws IOException { - return --num<0 ? null : t; + public Token next(Token token) throws IOException { + return --num<0 ? null : (Token) t.clone(); } } Index: src/test/org/apache/lucene/index/TestDocumentWriter.java =================================================================== --- src/test/org/apache/lucene/index/TestDocumentWriter.java (revision 684150) +++ src/test/org/apache/lucene/index/TestDocumentWriter.java (working copy) @@ -17,21 +17,27 @@ * limitations under the License. 
*/ -import org.apache.lucene.analysis.*; +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.SimpleAnalyzer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.analysis.WhitespaceTokenizer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.Fieldable; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field.TermVector; -import org.apache.lucene.document.Fieldable; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; -import java.io.IOException; -import java.io.Reader; - public class TestDocumentWriter extends LuceneTestCase { private RAMDirectory dir; @@ -134,10 +140,6 @@ boolean first=true; Token buffered; - public Token next() throws IOException { - return input.next(); - } - public Token next(Token result) throws IOException { if (buffered != null) { Token t = buffered; @@ -199,11 +201,11 @@ private String[] tokens = new String[] {"term1", "term2", "term3", "term2"}; private int index = 0; - public Token next() throws IOException { + public Token next(Token token) throws IOException { if (index == tokens.length) { return null; } else { - return new Token(tokens[index++], 0, 0); + return token.reinit(tokens[index++], 0, 0); } } Index: src/test/org/apache/lucene/index/TestPayloads.java =================================================================== --- src/test/org/apache/lucene/index/TestPayloads.java (revision 684150) +++ src/test/org/apache/lucene/index/TestPayloads.java (working copy) @@ -536,11 +536,11 @@ first = true; } - public Token next() throws IOException { - if (!first) return null; - Token t = new Token(term, 0, 0); - t.setPayload(new Payload(payload)); - return t; + public Token next(Token token) throws IOException { + if (!first) return null; + token.reinit(term, 0, 0); + token.setPayload(new Payload(payload)); + return token; } public void close() throws IOException { Index: src/java/org/apache/lucene/queryParser/TokenMgrError.java =================================================================== --- src/java/org/apache/lucene/queryParser/TokenMgrError.java (revision 684150) +++ src/java/org/apache/lucene/queryParser/TokenMgrError.java (working copy) @@ -72,7 +72,7 @@ default: if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) { String s = "0000" + Integer.toString(ch, 16); - retval.append("\\u").append(s.substring(s.length() - 4, s.length())); + retval.append("\\u" + s.substring(s.length() - 4, s.length())); } else { retval.append(ch); } Index: src/java/org/apache/lucene/queryParser/QueryParser.java =================================================================== --- src/java/org/apache/lucene/queryParser/QueryParser.java (revision 684150) +++ src/java/org/apache/lucene/queryParser/QueryParser.java (working copy) @@ -1,14 +1,35 @@ /* Generated By:JavaCC: Do not edit this line. 
QueryParser.java */ package org.apache.lucene.queryParser; +import java.io.IOException; +import java.io.StringReader; +import java.text.DateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; import java.util.Vector; -import java.io.*; -import java.text.*; -import java.util.*; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.DateField; +import org.apache.lucene.document.DateTools; import org.apache.lucene.index.Term; -import org.apache.lucene.analysis.*; -import org.apache.lucene.document.*; -import org.apache.lucene.search.*; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.ConstantScoreRangeQuery; +import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MultiPhraseQuery; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.RangeQuery; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.Parameter; /** @@ -451,20 +472,21 @@ TokenStream source = analyzer.tokenStream(field, new StringReader(queryText)); Vector v = new Vector(); + final org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token(); org.apache.lucene.analysis.Token t; int positionCount = 0; boolean severalTokensAtSamePosition = false; while (true) { try { - t = source.next(); + t = source.next(reusableToken); } catch (IOException e) { t = null; } if (t == null) break; - v.addElement(t); + v.addElement(t.clone()); if (t.getPositionIncrement() != 0) positionCount += t.getPositionIncrement(); else @@ -481,7 +503,7 @@ return null; else if (v.size() == 1) { t = (org.apache.lucene.analysis.Token) v.elementAt(0); - return new TermQuery(new Term(field, t.termText())); + return new TermQuery(new Term(field, t.term())); } else { if (severalTokensAtSamePosition) { if (positionCount == 1) { @@ -490,7 +512,7 @@ for (int i = 0; i < v.size(); i++) { t = (org.apache.lucene.analysis.Token) v.elementAt(i); TermQuery currentQuery = new TermQuery( - new Term(field, t.termText())); + new Term(field, t.term())); q.add(currentQuery, BooleanClause.Occur.SHOULD); } return q; @@ -512,7 +534,7 @@ multiTerms.clear(); } position += t.getPositionIncrement(); - multiTerms.add(new Term(field, t.termText())); + multiTerms.add(new Term(field, t.term())); } if (enablePositionIncrements) { mpq.add((Term[])multiTerms.toArray(new Term[0]),position); @@ -530,9 +552,9 @@ t = (org.apache.lucene.analysis.Token) v.elementAt(i); if (enablePositionIncrements) { position += t.getPositionIncrement(); - pq.add(new Term(field, t.termText()),position); + pq.add(new Term(field, t.term()),position); } else { - pq.add(new Term(field, t.termText())); + pq.add(new Term(field, t.term())); } } return pq; @@ -1490,6 +1512,9 @@ public ParseException generateParseException() { jj_expentries.removeAllElements(); boolean[] la1tokens = new boolean[34]; + for (int i = 0; i < 34; i++) { + la1tokens[i] = false; + } if (jj_kind >= 0) { la1tokens[jj_kind] = true; jj_kind = -1; Index: src/java/org/apache/lucene/queryParser/QueryParser.jj =================================================================== --- 
src/java/org/apache/lucene/queryParser/QueryParser.jj (revision 684150) +++ src/java/org/apache/lucene/queryParser/QueryParser.jj (working copy) @@ -25,14 +25,35 @@ package org.apache.lucene.queryParser; +import java.io.IOException; +import java.io.StringReader; +import java.text.DateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; import java.util.Vector; -import java.io.*; -import java.text.*; -import java.util.*; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.DateField; +import org.apache.lucene.document.DateTools; import org.apache.lucene.index.Term; -import org.apache.lucene.analysis.*; -import org.apache.lucene.document.*; -import org.apache.lucene.search.*; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.ConstantScoreRangeQuery; +import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MultiPhraseQuery; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.RangeQuery; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.Parameter; /** @@ -475,20 +496,21 @@ TokenStream source = analyzer.tokenStream(field, new StringReader(queryText)); Vector v = new Vector(); + final org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token(); org.apache.lucene.analysis.Token t; int positionCount = 0; boolean severalTokensAtSamePosition = false; while (true) { try { - t = source.next(); + t = source.next(reusableToken); } catch (IOException e) { t = null; } if (t == null) break; - v.addElement(t); + v.addElement(t.clone()); if (t.getPositionIncrement() != 0) positionCount += t.getPositionIncrement(); else @@ -505,7 +527,7 @@ return null; else if (v.size() == 1) { t = (org.apache.lucene.analysis.Token) v.elementAt(0); - return new TermQuery(new Term(field, t.termText())); + return new TermQuery(new Term(field, t.term())); } else { if (severalTokensAtSamePosition) { if (positionCount == 1) { @@ -514,7 +536,7 @@ for (int i = 0; i < v.size(); i++) { t = (org.apache.lucene.analysis.Token) v.elementAt(i); TermQuery currentQuery = new TermQuery( - new Term(field, t.termText())); + new Term(field, t.term())); q.add(currentQuery, BooleanClause.Occur.SHOULD); } return q; @@ -536,7 +558,7 @@ multiTerms.clear(); } position += t.getPositionIncrement(); - multiTerms.add(new Term(field, t.termText())); + multiTerms.add(new Term(field, t.term())); } if (enablePositionIncrements) { mpq.add((Term[])multiTerms.toArray(new Term[0]),position); @@ -554,9 +576,9 @@ t = (org.apache.lucene.analysis.Token) v.elementAt(i); if (enablePositionIncrements) { position += t.getPositionIncrement(); - pq.add(new Term(field, t.termText()),position); + pq.add(new Term(field, t.term()),position); } else { - pq.add(new Term(field, t.termText())); + pq.add(new Term(field, t.term())); } } return pq; Index: src/java/org/apache/lucene/queryParser/CharStream.java =================================================================== --- src/java/org/apache/lucene/queryParser/CharStream.java (revision 684150) +++ src/java/org/apache/lucene/queryParser/CharStream.java (working 
copy) @@ -1,4 +1,4 @@ -/* Generated By:JavaCC: Do not edit this line. CharStream.java Version 3.0 */ +/* Generated By:JavaCC: Do not edit this line. CharStream.java Version 4.0 */ package org.apache.lucene.queryParser; /** Index: src/java/org/apache/lucene/queryParser/ParseException.java =================================================================== --- src/java/org/apache/lucene/queryParser/ParseException.java (revision 684150) +++ src/java/org/apache/lucene/queryParser/ParseException.java (working copy) @@ -98,19 +98,19 @@ if (!specialConstructor) { return super.getMessage(); } - String expected = ""; + StringBuffer expected = new StringBuffer(); int maxSize = 0; for (int i = 0; i < expectedTokenSequences.length; i++) { if (maxSize < expectedTokenSequences[i].length) { maxSize = expectedTokenSequences[i].length; } for (int j = 0; j < expectedTokenSequences[i].length; j++) { - expected += tokenImage[expectedTokenSequences[i][j]] + " "; + expected.append(tokenImage[expectedTokenSequences[i][j]]).append(" "); } if (expectedTokenSequences[i][expectedTokenSequences[i].length - 1] != 0) { - expected += "..."; + expected.append("..."); } - expected += eol + " "; + expected.append(eol).append(" "); } String retval = "Encountered \""; Token tok = currentToken.next; @@ -130,7 +130,7 @@ } else { retval += "Was expecting one of:" + eol + " "; } - retval += expected; + retval += expected.toString(); return retval; } @@ -179,7 +179,7 @@ default: if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) { String s = "0000" + Integer.toString(ch, 16); - retval.append("\\u").append(s.substring(s.length() - 4, s.length())); + retval.append("\\u" + s.substring(s.length() - 4, s.length())); } else { retval.append(ch); } Index: src/java/org/apache/lucene/queryParser/QueryParserTokenManager.java =================================================================== --- src/java/org/apache/lucene/queryParser/QueryParserTokenManager.java (revision 684150) +++ src/java/org/apache/lucene/queryParser/QueryParserTokenManager.java (working copy) @@ -1,13 +1,33 @@ /* Generated By:JavaCC: Do not edit this line. 
QueryParserTokenManager.java */ package org.apache.lucene.queryParser; +import java.io.IOException; +import java.io.StringReader; +import java.text.DateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; import java.util.Vector; -import java.io.*; -import java.text.*; -import java.util.*; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.DateField; +import org.apache.lucene.document.DateTools; import org.apache.lucene.index.Term; -import org.apache.lucene.analysis.*; -import org.apache.lucene.document.*; -import org.apache.lucene.search.*; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.ConstantScoreRangeQuery; +import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MultiPhraseQuery; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.RangeQuery; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.Parameter; public class QueryParserTokenManager implements QueryParserConstants Index: src/java/org/apache/lucene/analysis/SinkTokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/SinkTokenizer.java (revision 684150) +++ src/java/org/apache/lucene/analysis/SinkTokenizer.java (working copy) @@ -22,11 +22,11 @@ } public SinkTokenizer() { - this.lst = new ArrayList(); + this.lst = new ArrayList/**/(); } public SinkTokenizer(int initCap){ - this.lst = new ArrayList(initCap); + this.lst = new ArrayList/**/(initCap); } /** @@ -35,6 +35,8 @@ * WARNING: Adding tokens to this list requires the {@link #reset()} method to be called in order for them * to be made available. Also, this Tokenizer does nothing to protect against {@link java.util.ConcurrentModificationException}s * in the case of adds happening while {@link #next(org.apache.lucene.analysis.Token)} is being called. + *

+ * WARNING: Since this SinkTokenizer can be reset and the cached tokens made available again, do not modify them. Modify clones instead. * * @return A List of {@link org.apache.lucene.analysis.Token}s */ @@ -47,9 +49,15 @@ * @return The next {@link org.apache.lucene.analysis.Token} in the Sink. * @throws IOException */ - public Token next() throws IOException { + public Token next(Token token) throws IOException { + assert token != null; if (iter == null) iter = lst.iterator(); - return iter.hasNext() ? (Token) iter.next() : null; + // Since this TokenStream can be reset we have to maintain the tokens as immutable + if (iter.hasNext()) { + token = (Token) iter.next(); + return (Token) token.clone(); + } + return null; } Index: src/java/org/apache/lucene/analysis/CachingTokenFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/CachingTokenFilter.java (revision 684150) +++ src/java/org/apache/lucene/analysis/CachingTokenFilter.java (working copy) @@ -40,11 +40,12 @@ super(input); } - public Token next() throws IOException { + public Token next(Token token) throws IOException { + assert token != null; if (cache == null) { // fill cache lazily cache = new LinkedList(); - fillCache(); + fillCache(token); iterator = cache.iterator(); } @@ -52,8 +53,9 @@ // the cache is exhausted, return null return null; } - - return (Token) iterator.next(); + // Since the TokenFilter can be reset, the tokens need to be preserved as immutable. + Token t = (Token) iterator.next(); + return (Token) t.clone(); } public void reset() throws IOException { @@ -62,10 +64,9 @@ } } - private void fillCache() throws IOException { - Token token; - while ( (token = input.next()) != null) { - cache.add(token); + private void fillCache(final Token reusableToken) throws IOException { + for (Token token = input.next(reusableToken); token != null; token = input.next(reusableToken)) { + cache.add(token.clone()); } } Index: src/java/org/apache/lucene/analysis/CharTokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/CharTokenizer.java (revision 684150) +++ src/java/org/apache/lucene/analysis/CharTokenizer.java (working copy) @@ -45,6 +45,7 @@ } public final Token next(Token token) throws IOException { + assert token != null; token.clear(); int length = 0; int start = bufferIndex; @@ -81,9 +82,9 @@ break; // return 'em } - token.termLength = length; - token.startOffset = start; - token.endOffset = start+length; + token.setTermLength(length); + token.setStartOffset(start); + token.setEndOffset(start+length); return token; } Index: src/java/org/apache/lucene/analysis/Tokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/Tokenizer.java (revision 684150) +++ src/java/org/apache/lucene/analysis/Tokenizer.java (working copy) @@ -24,8 +24,9 @@
<p>
This is an abstract class.
<p>
- NOTE: subclasses must override at least one of {@link
- #next()} or {@link #next(Token)}.
+ NOTE: subclasses must override {@link #next(Token)}. It's
+ also OK to instead override {@link #next()} but that
+ method is now deprecated in favor of {@link #next(Token)}.
<p>
NOTE: subclasses overriding {@link #next(Token)} must call {@link Token#clear()}. Index: src/java/org/apache/lucene/analysis/PorterStemFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/PorterStemFilter.java (revision 684150) +++ src/java/org/apache/lucene/analysis/PorterStemFilter.java (working copy) @@ -46,9 +46,10 @@ } public final Token next(Token result) throws IOException { + assert result != null; result = input.next(result); if (result != null) { - if (stemmer.stem(result.termBuffer(), 0, result.termLength)) + if (stemmer.stem(result.termBuffer(), 0, result.termLength())) result.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength()); return result; } else Index: src/java/org/apache/lucene/analysis/KeywordTokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/KeywordTokenizer.java (revision 684150) +++ src/java/org/apache/lucene/analysis/KeywordTokenizer.java (working copy) @@ -39,6 +39,7 @@ } public Token next(Token result) throws IOException { + assert result != null; if (!done) { done = true; int upto = 0; @@ -51,7 +52,7 @@ if (upto == buffer.length) buffer = result.resizeTermBuffer(1+buffer.length); } - result.termLength = upto; + result.setTermLength(upto); return result; } return null; Index: src/java/org/apache/lucene/analysis/standard/StandardFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/standard/StandardFilter.java (revision 684150) +++ src/java/org/apache/lucene/analysis/standard/StandardFilter.java (working copy) @@ -39,6 +39,7 @@ *

Removes dots from acronyms. */ public final Token next(Token result) throws java.io.IOException { + assert result != null; Token t = input.next(result); if (t == null) Index: src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (revision 684150) +++ src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (working copy) @@ -133,6 +133,7 @@ * @see org.apache.lucene.analysis.TokenStream#next() */ public Token next(Token result) throws IOException { + assert result != null; int posIncr = 1; while(true) { Index: src/java/org/apache/lucene/analysis/Token.java =================================================================== --- src/java/org/apache/lucene/analysis/Token.java (revision 684150) +++ src/java/org/apache/lucene/analysis/Token.java (working copy) @@ -19,8 +19,9 @@ import org.apache.lucene.index.Payload; import org.apache.lucene.index.TermPositions; // for javadoc +import org.apache.lucene.util.ArrayUtil; -/** A Token is an occurence of a term from the text of a field. It consists of +/** A Token is an occurrence of a term from the text of a field. It consists of a term's text, the start and end offset of the term in the text of the field, and a type string.

@@ -29,7 +30,7 @@ browser, or to show matching text fragments in a KWIC (KeyWord In Context) display, etc.

- The type is an interned string, assigned by a lexical analyzer + The type is a string, assigned by a lexical analyzer (a.k.a. tokenizer), naming the lexical or syntactic class that the token belongs to. For example an end of sentence marker token might be implemented with type "eos". The default token type is "word". @@ -49,7 +50,7 @@

NOTE: As of 2.3, Token stores the term text internally as a malleable char[] termBuffer instead of String termText. The indexing code and core tokenizers - have been changed re-use a single Token instance, changing + have been changed to re-use a single Token instance, changing its buffer and other fields in-place as the Token is processed. This provides substantially better indexing performance as it saves the GC cost of new'ing a Token and @@ -62,14 +63,55 @@ instance when possible for best performance, by implementing the {@link TokenStream#next(Token)} API. Failing that, to create a new Token you should first use - one of the constructors that starts with null text. Then - you should call either {@link #termBuffer()} or {@link - #resizeTermBuffer(int)} to retrieve the Token's - termBuffer. Fill in the characters of your term into this - buffer, and finally call {@link #setTermLength(int)} to + one of the constructors that starts with null text. To load + the token from a char[] use {@link #setTermBuffer(char[], int, int)}. + To load from a String use {@link #setTermBuffer(String)} or {@link #setTermBuffer(String, int, int)}. + Alternatively you can get the Token's termBuffer by calling either {@link #termBuffer()}, + if you know that your text is shorter than the capacity of the termBuffer + or {@link #resizeTermBuffer(int)}, if there is any possibility + that you may need to grow the buffer. Fill in the characters of your term into this + buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string, + or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setTermLength(int)} to set the length of the term text. See LUCENE-969 for details.

+

Typical reuse patterns: +

    +
  • Copying text from a string (type is reset to #DEFAULT_TYPE if not specified):
    +
    +    return reusableToken.reinit(string, startOffset, endOffset[, type]);
    +  
    +
  • +
  • Copying some text from a string (type is reset to #DEFAULT_TYPE if not specified):
    +
    +    return reusableToken.reinit(string, 0, string.length(), startOffset, endOffset[, type]);
    +  
    +
  • + +
  • Copying text from a char[] buffer (type is reset to #DEFAULT_TYPE if not specified):
    +
    +    return reusableToken.reinit(buffer, 0, buffer.length, startOffset, endOffset[, type]);
    +  
    +
  • +
  • Copying some text from a char[] buffer (type is reset to #DEFAULT_TYPE if not specified):
    +
    +    return reusableToken.reinit(buffer, start, end - start, startOffset, endOffset[, type]);
    +  
    +
  • +
  • Copying from one Token to another (type is reset to #DEFAULT_TYPE if not specified):
    +
    +    return reusableToken.reinit(source.termBuffer(), 0, source.termLength(), source.startOffset(), source.endOffset()[, source.type()]);
    +  
    +
  • +
+ A couple of things to note (a short usage sketch follows these notes): +
    +
  • clear() initializes most of the fields to default values, but not startOffset, endOffset and type.
  • +
  • Because TokenStreams can be chained, one cannot assume that the Token's current type is correct.
  • +
  • The startOffset and endOffset represent the start and end offset in the source text. So be careful when adjusting them.
  • +
  • When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.
  • +
+
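For illustration only, the reuse and cloning rules above could be combined in a filter along the lines of the sketch below. The filter and its term rewriting are hypothetical and not part of this patch; the sketch relies only on the next(Token), term(), reinit(...) and clone() calls documented here.

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

/** Hypothetical example: rewrites "colour" to "color" and remembers the previous token. */
public final class ColourFilter extends TokenFilter {
  private Token prevToken;                       // always a clone, never the reusable instance

  public ColourFilter(TokenStream input) {
    super(input);
  }

  public Token next(Token token) throws IOException {
    assert token != null;
    token = input.next(token);
    if (token == null) {
      prevToken = null;
      return null;
    }
    if (token.term().equals("colour")) {
      // Copy the replacement text into the reusable token; offsets and type are set explicitly.
      token.reinit("color", token.startOffset(), token.endOffset(), token.type());
    }
    prevToken = (Token) token.clone();           // cache a private copy, per the notes above
    return token;
  }
}

Caching a clone in prevToken, rather than the reusable instance itself, follows the last note above: the caller may change the reusable token arbitrarily after it is returned.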

@see org.apache.lucene.index.Payload */ @@ -83,16 +125,56 @@ * deprecated APIs */ private String termText; - char[] termBuffer; // characters for the term text - int termLength; // length of term text in buffer + /** + * Characters for the term text. + * @deprecated This will be made private. Instead, use: + * {@link termBuffer()}, + * {@link #setTermBuffer(char[], int, int)}, + * {@link #setTermBuffer(String)}, or + * {@link #setTermBuffer(String, int, int)} + */ + char[] termBuffer; - int startOffset; // start in source text - int endOffset; // end in source text - String type = DEFAULT_TYPE; // lexical type + /** + * Length of term text in the buffer. + * @deprecated This will be made private. Instead, use: + * {@link termLength()}, or @{link setTermLength(int)}. + */ + int termLength; + + /** + * Start in source text. + * @deprecated This will be made private. Instead, use: + * {@link startOffset()}, or @{link setStartOffset(int)}. + */ + int startOffset; + + /** + * End in source text. + * @deprecated This will be made private. Instead, use: + * {@link endOffset()}, or @{link setEndOffset(int)}. + */ + int endOffset; + + /** + * The lexical type of the token. + * @deprecated This will be made private. Instead, use: + * {@link type()}, or @{link setType(String)}. + */ + String type = DEFAULT_TYPE; + private int flags; + /** + * @deprecated This will be made private. Instead, use: + * {@link getPayload()}, or @{link setPayload(Payload)}. + */ Payload payload; + /** + * @deprecated This will be made private. Instead, use: + * {@link getPositionIncrement()}, or @{link setPositionIncrement(String)}. + */ int positionIncrement = 1; /** Constructs a Token will null text. */ @@ -101,8 +183,8 @@ /** Constructs a Token with null text and start & end * offsets. - * @param start start offset - * @param end end offset */ + * @param start start offset in the source text + * @param end end offset in the source text */ public Token(int start, int end) { startOffset = start; endOffset = end; @@ -110,8 +192,9 @@ /** Constructs a Token with null text and start & end * offsets plus the Token type. - * @param start start offset - * @param end end offset */ + * @param start start offset in the source text + * @param end end offset in the source text + * @param type the lexical type of this Token */ public Token(int start, int end, String typ) { startOffset = start; endOffset = end; @@ -120,12 +203,12 @@ /** * Constructs a Token with null text and start & end - * offsets plus the Token type. - * @param start start offset - * @param end end offset - * @param flags The bits to set for this token + * offsets plus flags. NOTE: flags is EXPERIMENTAL. + * @param start start offset in the source text + * @param end end offset in the source text + * @param flags The bits to set for this token */ - public Token(int start, int end, int flags){ + public Token(int start, int end, int flags) { startOffset = start; endOffset = end; this.flags = flags; @@ -138,7 +221,9 @@ * term text. 
* @param text term text * @param start start offset - * @param end end offset */ + * @param end end offset + * @deprecated + */ public Token(String text, int start, int end) { termText = text; startOffset = start; @@ -152,7 +237,9 @@ * @param text term text * @param start start offset * @param end end offset - * @param typ token type */ + * @param typ token type + * @deprecated + */ public Token(String text, int start, int end, String typ) { termText = text; startOffset = start; @@ -169,6 +256,7 @@ * @param start * @param end * @param flags token type bits + * @deprecated */ public Token(String text, int start, int end, int flags) { termText = text; @@ -177,6 +265,22 @@ this.flags = flags; } + /** + * Constructs a Token with the given term buffer (offset + * & length), start and end + * offsets + * @param termBuffer + * @param termBufferOffset + * @param termBufferLength + * @param start + * @param end + */ + public Token(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) { + setTermBuffer(startTermBuffer, termBufferOffset, termBufferLength); + startOffset = start; + endOffset = end; + } + /** Set the position increment. This determines the position of this token * relative to the previous Token in a {@link TokenStream}, used in phrase * searching. @@ -200,6 +304,7 @@ * occur with no intervening stop words. * * + * @param positionIncrement the distance from the prior term * @see org.apache.lucene.index.TermPositions */ public void setPositionIncrement(int positionIncrement) { @@ -218,7 +323,11 @@ /** Sets the Token's term text. NOTE: for better * indexing speed you should instead use the char[] - * termBuffer methods to set the term text. */ + * termBuffer methods to set the term text. + * @deprecated use {@link #setTermBuffer(char[], int, length)} or + * {@link #setTermBuffer(String)} or + * {@link #setTermBuffer(String, int, int)}. + */ public void setTermText(String text) { termText = text; termBuffer = null; @@ -230,7 +339,7 @@ * because the text is stored internally in a char[]. If * possible, use {@link #termBuffer()} and {@link * #termLength()} directly instead. If you really need a - * String, use new String(token.termBuffer(), 0, token.termLength()) + * String, use {@link #term()} */ public final String termText() { if (termText == null && termBuffer != null) @@ -238,19 +347,70 @@ return termText; } + /** Returns the Token's term text. + * + * This method has a performance penalty + * because the text is stored internally in a char[]. If + * possible, use {@link #termBuffer()} and {@link + * #termLength()} directly instead. If you really need a + * String, use this method, which is nothing more than + * a convenience call to new String(token.termBuffer(), 0, token.termLength()) + */ + public final String term() { + if (termText != null) + return termText; + initTermBuffer(); + return new String(termBuffer, 0, termLength); + } + /** Copies the contents of buffer, starting at offset for - * length characters, into the termBuffer - * array. NOTE: for better indexing speed you - * should instead retrieve the termBuffer, using {@link - * #termBuffer()} or {@link #resizeTermBuffer(int)}, and - * fill it in directly to set the term text. This saves - * an extra copy. */ + * length characters, into the termBuffer array. 
+ * @param buffer the buffer to copy + * @param offset the index in the buffer of the first character to copy + * @param length the number of characters to copy + */ public final void setTermBuffer(char[] buffer, int offset, int length) { - resizeTermBuffer(length); + termText = null; + char[] newCharBuffer = growTermBuffer(length); + if (newCharBuffer != null) { + termBuffer = newCharBuffer; + } System.arraycopy(buffer, offset, termBuffer, 0, length); termLength = length; } + /** Copies the contents of buffer into the termBuffer array. + * @param buffer the buffer to copy + */ + public final void setTermBuffer(String buffer) { + termText = null; + int length = buffer.length(); + char[] newCharBuffer = growTermBuffer(length); + if (newCharBuffer != null) { + termBuffer = newCharBuffer; + } + buffer.getChars(0, length, termBuffer, 0); + termLength = length; + } + + /** Copies the contents of buffer, starting at offset and continuing + * for length characters, into the termBuffer array. + * @param buffer the buffer to copy + * @param offset the index in the buffer of the first character to copy + * @param length the number of characters to copy + */ + public final void setTermBuffer(String buffer, int offset, int length) { + assert offset <= buffer.length(); + assert offset + length <= buffer.length(); + termText = null; + char[] newCharBuffer = growTermBuffer(length); + if (newCharBuffer != null) { + termBuffer = newCharBuffer; + } + buffer.getChars(offset, offset + length, termBuffer, 0); + termLength = length; + } + /** Returns the internal termBuffer character array which * you can then directly alter. If the array is too * small for your token, use {@link @@ -263,23 +423,69 @@ return termBuffer; } - /** Grows the termBuffer to at least size newSize. + /** Grows the termBuffer to at least size newSize, preserving the + * existing content. Note: If the next operation is to change + * the contents of the term buffer use + * {@link #setTermBuffer(char[], int, int)}, + * {@link #setTermBuffer(String)}, or + * {@link #setTermBuffer(String, int, int)} + * to optimally combine the resize with the setting of the termBuffer. * @param newSize minimum size of the new termBuffer * @return newly created termBuffer with length >= newSize */ public char[] resizeTermBuffer(int newSize) { - initTermBuffer(); - if (newSize > termBuffer.length) { - int size = termBuffer.length; - while(size < newSize) - size *= 2; - char[] newBuffer = new char[size]; - System.arraycopy(termBuffer, 0, newBuffer, 0, termBuffer.length); - termBuffer = newBuffer; + char[] newCharBuffer = growTermBuffer(newSize); + if (termBuffer == null) { + // If there were termText, then preserve it. + // note that if termBuffer is null then newCharBuffer cannot be null + assert newCharBuffer != null; + if (termText != null) { + termText.getChars(0, termText.length(), newCharBuffer, 0); + } + termBuffer = newCharBuffer; + } else if (newCharBuffer != null) { + // Note: if newCharBuffer != null then termBuffer needs to grow. 
+ // If there were a termBuffer, then preserve it + System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length); + termBuffer = newCharBuffer; } + termText = null; return termBuffer; } + /** Allocates a buffer char[] of at least newSize + * @param newSize minimum size of the buffer + * @return newly created buffer with length >= newSize or null if the current termBuffer is big enough + */ + private char[] growTermBuffer(int newSize) { + if (termBuffer != null) { + if (termBuffer.length >= newSize) + // Already big enough + return null; + else + // Not big enough; create a new array with slight + // over allocation: + return new char[ArrayUtil.getNextSize(newSize)]; + } else { + + // determine the best size + // The buffer is always at least MIN_BUFFER_SIZE + if (newSize < MIN_BUFFER_SIZE) { + newSize = MIN_BUFFER_SIZE; + } + + // If there is already a termText, then the size has to be at least that big + if (termText != null) { + int ttLength = termText.length(); + if (newSize < ttLength) { + newSize = ttLength; + } + } + + return new char[newSize]; + } + } + // TODO: once we remove the deprecated termText() method // and switch entirely to char[] termBuffer we don't need // to use this method anymore @@ -308,9 +514,16 @@ } /** Set number of valid characters (length of the term) in - * the termBuffer array. */ + * the termBuffer array. Use this to truncate the termBuffer + * or to synchronize with external manipulation of the termBuffer. + * Note: to grow the size of the array, + * use {@link #resizeTermBuffer(int)} first. + * @param length the truncated length + */ public final void setTermLength(int length) { initTermBuffer(); + if (length > termBuffer.length) + throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")"); termLength = length; } @@ -331,7 +544,8 @@ } /** Returns this Token's ending offset, one greater than the position of the - last character corresponding to this token in the source text. */ + last character corresponding to this token in the source text. The length + of the token in the source text is (endOffset - startOffset). */ public final int endOffset() { return endOffset; } @@ -374,8 +588,6 @@ this.flags = flags; } - - /** * Returns this Token's payload. */ @@ -424,9 +636,9 @@ public Object clone() { try { Token t = (Token)super.clone(); + // Do a deep clone if (termBuffer != null) { - t.termBuffer = null; - t.setTermBuffer(termBuffer, 0, termLength); + t.termBuffer = (char[]) termBuffer.clone(); } if (payload != null) { t.setPayload((Payload) payload.clone()); @@ -436,4 +648,168 @@ throw new RuntimeException(e); // shouldn't happen } } + + /** Makes a clone, but replaces the term buffer & + * start/end offset in the process. This is more + * efficient than doing a full clone (and then calling + * setTermBuffer) because it saves a wasted copy of the old + * termBuffer. 
*/ + public Token clone(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { + final Token t = new Token(newTermBuffer, newTermOffset, newTermLength, newStartOffset, newEndOffset); + t.positionIncrement = positionIncrement; + t.flags = flags; + t.type = type; + if (payload != null) + t.payload = (Payload) payload.clone(); + return t; + } + + public boolean equals(Object obj) { + if (obj == this) + return true; + + if (obj instanceof Token) { + Token other = (Token) obj; + + initTermBuffer(); + other.initTermBuffer(); + + if (termLength == other.termLength && + startOffset == other.startOffset && + endOffset == other.endOffset && + flags == other.flags && + positionIncrement == other.positionIncrement && + subEqual(type, other.type) && + subEqual(payload, other.payload)) { + for(int i=0;i This is an abstract class. - NOTE: subclasses must override at least one of {@link - #next()} or {@link #next(Token)}. + NOTE: subclasses must override {@link #next(Token)}. It's + also OK to instead override {@link #next()} but that + method is now deprecated in favor of {@link #next(Token)}. */ public abstract class TokenFilter extends TokenStream { /** The source of tokens for this filter. */ Index: src/java/org/apache/lucene/analysis/LengthFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/LengthFilter.java (revision 684150) +++ src/java/org/apache/lucene/analysis/LengthFilter.java (working copy) @@ -42,10 +42,11 @@ } /** - * Returns the next input Token whose termText() is the right len + * Returns the next input Token whose term() is the right len */ public final Token next(Token result) throws IOException { + assert result != null; // return the first non-stop word found for (Token token = input.next(result); token != null; token = input.next(result)) { Index: src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java (revision 684150) +++ src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java (working copy) @@ -33,6 +33,7 @@ private int outputPos; public final Token next(Token result) throws java.io.IOException { + assert result != null; result = input.next(result); if (result != null) { final char[] buffer = result.termBuffer(); Index: src/java/org/apache/lucene/analysis/LowerCaseFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/LowerCaseFilter.java (revision 684150) +++ src/java/org/apache/lucene/analysis/LowerCaseFilter.java (working copy) @@ -30,11 +30,12 @@ } public final Token next(Token result) throws IOException { + assert result != null; result = input.next(result); if (result != null) { final char[] buffer = result.termBuffer(); - final int length = result.termLength; + final int length = result.termLength(); for(int i=0;i{@link TokenFilter}, a TokenStream whose input is another TokenStream. - NOTE: subclasses must override at least one of {@link - #next()} or {@link #next(Token)}. + NOTE: subclasses must override {@link #next(Token)}. It's + also OK to instead override {@link #next()} but that + method is now deprecated in favor of {@link #next(Token)}. */ public abstract class TokenStream { /** Returns the next token in the stream, or null at EOS. 
- * The returned Token is a "full private copy" (not + * @deprecated The returned Token is a "full private copy" (not * re-used across calls to next()) but will be slower * than calling {@link #next(Token)} instead.. */ public Token next() throws IOException { @@ -71,11 +72,20 @@ *
  • A producer must call {@link Token#clear()} * before setting the fields in it & returning it
  • * + * Also, the producer must make no assumptions about a + * Token after it has been returned: the caller may + * arbitrarily change it. If the producer needs to hold + * onto the token for subsequent calls, it must clone() + * it before storing it. * Note that a {@link TokenFilter} is considered a consumer. - * @param result a Token that may or may not be used to return + * @param result a Token that may or may not be used to + * return; this parameter should never be null (the callee + * is not required to check for null before using it) * @return next token in the stream or null if end-of-stream was hit */ public Token next(Token result) throws IOException { + // We don't actually use result, but still add this assert + assert result != null; return next(); } @@ -84,7 +94,12 @@ * implement this method. Reset() is not needed for * the standard indexing process. However, if the Tokens * of a TokenStream are intended to be consumed more than - * once, it is necessary to implement reset(). + * once, it is necessary to implement reset(). Note that + * if your TokenStream caches tokens and feeds them back + * again after a reset, it is imperative that you + * clone the tokens when you store them away (on the + * first pass) as well as when you return them (on future + * passes after reset()). */ public void reset() throws IOException {} Index: src/java/org/apache/lucene/search/QueryTermVector.java =================================================================== --- src/java/org/apache/lucene/search/QueryTermVector.java (revision 684150) +++ src/java/org/apache/lucene/search/QueryTermVector.java (working copy) @@ -17,15 +17,20 @@ * limitations under the License. */ +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.index.TermFreqVector; -import java.io.IOException; -import java.io.StringReader; -import java.util.*; - /** * * @@ -51,12 +56,11 @@ TokenStream stream = analyzer.tokenStream("", new StringReader(queryString)); if (stream != null) { - Token next = null; List terms = new ArrayList(); try { - while ((next = stream.next()) != null) - { - terms.add(next.termText()); + final Token reusableToken = new Token(); + for (Token next = stream.next(reusableToken); next != null; next = stream.next(reusableToken)) { + terms.add(next.term()); } processTerms((String[])terms.toArray(new String[terms.size()])); } catch (IOException e) { Index: src/java/org/apache/lucene/index/DocInverterPerField.java =================================================================== --- src/java/org/apache/lucene/index/DocInverterPerField.java (revision 684150) +++ src/java/org/apache/lucene/index/DocInverterPerField.java (working copy) @@ -79,15 +79,7 @@ if (!field.isTokenized()) { // un-tokenized field String stringValue = field.stringValue(); final int valueLength = stringValue.length(); - Token token = perThread.localToken; - token.clear(); - char[] termBuffer = token.termBuffer(); - if (termBuffer.length < valueLength) - termBuffer = token.resizeTermBuffer(valueLength); - stringValue.getChars(0, valueLength, termBuffer, 0); - token.setTermLength(valueLength); - token.setStartOffset(fieldState.offset); - token.setEndOffset(fieldState.offset + stringValue.length()); + Token token = 
perThread.localToken.reinit(stringValue, fieldState.offset, fieldState.offset + valueLength); boolean success = false; try { consumer.add(token); @@ -96,7 +88,7 @@ if (!success) docState.docWriter.setAborting(); } - fieldState.offset += stringValue.length(); + fieldState.offset += valueLength; fieldState.length++; fieldState.position++; } else { // tokenized field Index: src/java/org/apache/lucene/index/Payload.java =================================================================== --- src/java/org/apache/lucene/index/Payload.java (revision 684150) +++ src/java/org/apache/lucene/index/Payload.java (working copy) @@ -21,143 +21,164 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.util.ArrayUtil; - /** - * A Payload is metadata that can be stored together with each occurrence - * of a term. This metadata is stored inline in the posting list of the - * specific term. - *

    - * To store payloads in the index a {@link TokenStream} has to be used that - * produces {@link Token}s containing payload data. - *

    - * Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)} - * to retrieve the payloads from the index.
    - * - */ - public class Payload implements Serializable, Cloneable { - /** the byte array containing the payload data */ - protected byte[] data; +/** + * A Payload is metadata that can be stored together with each occurrence + * of a term. This metadata is stored inline in the posting list of the + * specific term. + *

    + * To store payloads in the index a {@link TokenStream} has to be used that + * produces {@link Token}s containing payload data. + *

    + * Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)} + * to retrieve the payloads from the index.
    + * + */ +public class Payload implements Serializable, Cloneable { + /** the byte array containing the payload data */ + protected byte[] data; - /** the offset within the byte array */ - protected int offset; + /** the offset within the byte array */ + protected int offset; - /** the length of the payload data */ - protected int length; + /** the length of the payload data */ + protected int length; - /** Creates an empty payload and does not allocate a byte array. */ - public Payload() { - // nothing to do - } + /** Creates an empty payload and does not allocate a byte array. */ + public Payload() { + // nothing to do + } - /** - * Creates a new payload with the the given array as data. - * A reference to the passed-in array is held, i. e. no - * copy is made. - * - * @param data the data of this payload - */ - public Payload(byte[] data) { - this(data, 0, data.length); - } + /** + * Creates a new payload with the the given array as data. + * A reference to the passed-in array is held, i. e. no + * copy is made. + * + * @param data the data of this payload + */ + public Payload(byte[] data) { + this(data, 0, data.length); + } - /** - * Creates a new payload with the the given array as data. - * A reference to the passed-in array is held, i. e. no - * copy is made. - * - * @param data the data of this payload - * @param offset the offset in the data byte array - * @param length the length of the data - */ - public Payload(byte[] data, int offset, int length) { - if (offset < 0 || offset + length > data.length) { - throw new IllegalArgumentException(); - } - this.data = data; - this.offset = offset; - this.length = length; + /** + * Creates a new payload with the the given array as data. + * A reference to the passed-in array is held, i. e. no + * copy is made. + * + * @param data the data of this payload + * @param offset the offset in the data byte array + * @param length the length of the data + */ + public Payload(byte[] data, int offset, int length) { + if (offset < 0 || offset + length > data.length) { + throw new IllegalArgumentException(); } + this.data = data; + this.offset = offset; + this.length = length; + } - /** - * Sets this payloads data. - * A reference to the passed-in array is held, i. e. no - * copy is made. - */ - public void setData(byte[] data) { - setData(data, 0, data.length); - } + /** + * Sets this payloads data. + * A reference to the passed-in array is held, i. e. no + * copy is made. + */ + public void setData(byte[] data) { + setData(data, 0, data.length); + } - /** - * Sets this payloads data. - * A reference to the passed-in array is held, i. e. no - * copy is made. - */ - public void setData(byte[] data, int offset, int length) { - this.data = data; - this.offset = offset; - this.length = length; - } + /** + * Sets this payloads data. + * A reference to the passed-in array is held, i. e. no + * copy is made. + */ + public void setData(byte[] data, int offset, int length) { + this.data = data; + this.offset = offset; + this.length = length; + } - /** - * Returns a reference to the underlying byte array - * that holds this payloads data. - */ - public byte[] getData() { - return this.data; - } + /** + * Returns a reference to the underlying byte array + * that holds this payloads data. 
+ */ + public byte[] getData() { + return this.data; + } - /** - * Returns the offset in the underlying byte array - */ - public int getOffset() { - return this.offset; - } + /** + * Returns the offset in the underlying byte array + */ + public int getOffset() { + return this.offset; + } - /** - * Returns the length of the payload data. - */ - public int length() { - return this.length; - } + /** + * Returns the length of the payload data. + */ + public int length() { + return this.length; + } - /** - * Returns the byte at the given index. - */ - public byte byteAt(int index) { - if (0 <= index && index < this.length) { - return this.data[this.offset + index]; - } - throw new ArrayIndexOutOfBoundsException(index); + /** + * Returns the byte at the given index. + */ + public byte byteAt(int index) { + if (0 <= index && index < this.length) { + return this.data[this.offset + index]; } + throw new ArrayIndexOutOfBoundsException(index); + } - /** - * Allocates a new byte array, copies the payload data into it and returns it. - */ - public byte[] toByteArray() { - byte[] retArray = new byte[this.length]; - System.arraycopy(this.data, this.offset, retArray, 0, this.length); - return retArray; - } + /** + * Allocates a new byte array, copies the payload data into it and returns it. + */ + public byte[] toByteArray() { + byte[] retArray = new byte[this.length]; + System.arraycopy(this.data, this.offset, retArray, 0, this.length); + return retArray; + } - /** - * Copies the payload data to a byte array. - * - * @param target the target byte array - * @param targetOffset the offset in the target byte array - */ - public void copyTo(byte[] target, int targetOffset) { - if (this.length > target.length + targetOffset) { - throw new ArrayIndexOutOfBoundsException(); - } - System.arraycopy(this.data, this.offset, target, targetOffset, this.length); + /** + * Copies the payload data to a byte array. + * + * @param target the target byte array + * @param targetOffset the offset in the target byte array + */ + public void copyTo(byte[] target, int targetOffset) { + if (this.length > target.length + targetOffset) { + throw new ArrayIndexOutOfBoundsException(); } + System.arraycopy(this.data, this.offset, target, targetOffset, this.length); + } - /** - * Clones this payload by creating a copy of the underlying - * byte array. - */ - public Object clone() { - Payload clone = new Payload(this.toByteArray()); - return clone; - } + /** + * Clones this payload by creating a copy of the underlying + * byte array. 
+ */ + public Object clone() { + Payload clone = new Payload(this.toByteArray()); + return clone; + } + + public boolean equals(Object obj) { + if (obj == this) + return true; + if (obj instanceof Payload) { + Payload other = (Payload) obj; + if (length == other.length) { + for(int i=0;i=start;i--) + code = code*31 + array[i]; + return code; + } + + /** Returns hash of chars in range start (inclusive) to + * end (inclusive) */ + public static int hashCode(byte[] array, int start, int end) { + int code = 0; + for(int i=end-1;i>=start;i--) + code = code*31 + array[i]; + return code; + } } Index: src/demo/org/apache/lucene/demo/html/TokenMgrError.java =================================================================== --- src/demo/org/apache/lucene/demo/html/TokenMgrError.java (revision 684150) +++ src/demo/org/apache/lucene/demo/html/TokenMgrError.java (working copy) @@ -72,7 +72,7 @@ default: if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) { String s = "0000" + Integer.toString(ch, 16); - retval.append("\\u").append(s.substring(s.length() - 4, s.length())); + retval.append("\\u" + s.substring(s.length() - 4, s.length())); } else { retval.append(ch); } Index: src/demo/org/apache/lucene/demo/html/HTMLParser.java =================================================================== --- src/demo/org/apache/lucene/demo/html/HTMLParser.java (revision 684150) +++ src/demo/org/apache/lucene/demo/html/HTMLParser.java (working copy) @@ -487,7 +487,10 @@ private int jj_gc = 0; public HTMLParser(java.io.InputStream stream) { - jj_input_stream = new SimpleCharStream(stream, 1, 1); + this(stream, null); + } + public HTMLParser(java.io.InputStream stream, String encoding) { + try { jj_input_stream = new SimpleCharStream(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); } token_source = new HTMLParserTokenManager(jj_input_stream); token = new Token(); jj_ntk = -1; @@ -497,7 +500,10 @@ } public void ReInit(java.io.InputStream stream) { - jj_input_stream.ReInit(stream, 1, 1); + ReInit(stream, null); + } + public void ReInit(java.io.InputStream stream, String encoding) { + try { jj_input_stream.ReInit(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); } token_source.ReInit(jj_input_stream); token = new Token(); jj_ntk = -1; @@ -627,7 +633,9 @@ jj_lasttokens[jj_endpos++] = kind; } else if (jj_endpos != 0) { jj_expentry = new int[jj_endpos]; - System.arraycopy(jj_lasttokens, 0, jj_expentry, 0, jj_endpos); + for (int i = 0; i < jj_endpos; i++) { + jj_expentry[i] = jj_lasttokens[i]; + } boolean exists = false; for (java.util.Enumeration e = jj_expentries.elements(); e.hasMoreElements();) { int[] oldentry = (int[])(e.nextElement()); @@ -692,6 +700,7 @@ final private void jj_rescan_token() { jj_rescan = true; for (int i = 0; i < 2; i++) { + try { JJCalls p = jj_2_rtns[i]; do { if (p.gen > jj_gen) { @@ -703,6 +712,7 @@ } p = p.next; } while (p != null); + } catch(LookaheadSuccess ls) { } } jj_rescan = false; } Index: src/demo/org/apache/lucene/demo/html/SimpleCharStream.java =================================================================== --- src/demo/org/apache/lucene/demo/html/SimpleCharStream.java (revision 684150) +++ src/demo/org/apache/lucene/demo/html/SimpleCharStream.java (working copy) @@ -1,4 +1,4 @@ -/* Generated By:JavaCC: Do not edit this line. SimpleCharStream.java Version 3.0 */ +/* Generated By:JavaCC: Do not edit this line. 
SimpleCharStream.java Version 4.0 */ package org.apache.lucene.demo.html; /** @@ -27,7 +27,12 @@ protected char[] buffer; protected int maxNextCharInd = 0; protected int inBuf = 0; + protected int tabSize = 8; + protected void setTabSize(int i) { tabSize = i; } + protected int getTabSize(int i) { return tabSize; } + + protected void ExpandBuff(boolean wrapAround) { char[] newbuffer = new char[bufsize + 2048]; @@ -162,7 +167,7 @@ break; case '\t' : column--; - column += (8 - (column & 07)); + column += (tabSize - (column % tabSize)); break; default : break; @@ -248,7 +253,7 @@ } public SimpleCharStream(java.io.Reader dstream, int startline, - int startcolumn) + int startcolumn) { this(dstream, startline, startcolumn, 4096); } @@ -277,7 +282,7 @@ } public void ReInit(java.io.Reader dstream, int startline, - int startcolumn) + int startcolumn) { ReInit(dstream, startline, startcolumn, 4096); } @@ -286,35 +291,68 @@ { ReInit(dstream, 1, 1, 4096); } + public SimpleCharStream(java.io.InputStream dstream, String encoding, int startline, + int startcolumn, int buffersize) throws java.io.UnsupportedEncodingException + { + this(encoding == null ? new java.io.InputStreamReader(dstream) : new java.io.InputStreamReader(dstream, encoding), startline, startcolumn, buffersize); + } + public SimpleCharStream(java.io.InputStream dstream, int startline, int startcolumn, int buffersize) { - this(new java.io.InputStreamReader(dstream), startline, startcolumn, 4096); + this(new java.io.InputStreamReader(dstream), startline, startcolumn, buffersize); } + public SimpleCharStream(java.io.InputStream dstream, String encoding, int startline, + int startcolumn) throws java.io.UnsupportedEncodingException + { + this(dstream, encoding, startline, startcolumn, 4096); + } + public SimpleCharStream(java.io.InputStream dstream, int startline, - int startcolumn) + int startcolumn) { this(dstream, startline, startcolumn, 4096); } + public SimpleCharStream(java.io.InputStream dstream, String encoding) throws java.io.UnsupportedEncodingException + { + this(dstream, encoding, 1, 1, 4096); + } + public SimpleCharStream(java.io.InputStream dstream) { this(dstream, 1, 1, 4096); } + public void ReInit(java.io.InputStream dstream, String encoding, int startline, + int startcolumn, int buffersize) throws java.io.UnsupportedEncodingException + { + ReInit(encoding == null ? 
new java.io.InputStreamReader(dstream) : new java.io.InputStreamReader(dstream, encoding), startline, startcolumn, buffersize); + } + public void ReInit(java.io.InputStream dstream, int startline, int startcolumn, int buffersize) { - ReInit(new java.io.InputStreamReader(dstream), startline, startcolumn, 4096); + ReInit(new java.io.InputStreamReader(dstream), startline, startcolumn, buffersize); } + public void ReInit(java.io.InputStream dstream, String encoding) throws java.io.UnsupportedEncodingException + { + ReInit(dstream, encoding, 1, 1, 4096); + } + public void ReInit(java.io.InputStream dstream) { ReInit(dstream, 1, 1, 4096); } + public void ReInit(java.io.InputStream dstream, String encoding, int startline, + int startcolumn) throws java.io.UnsupportedEncodingException + { + ReInit(dstream, encoding, startline, startcolumn, 4096); + } public void ReInit(java.io.InputStream dstream, int startline, - int startcolumn) + int startcolumn) { ReInit(dstream, startline, startcolumn, 4096); } Index: src/demo/org/apache/lucene/demo/html/ParseException.java =================================================================== --- src/demo/org/apache/lucene/demo/html/ParseException.java (revision 684150) +++ src/demo/org/apache/lucene/demo/html/ParseException.java (working copy) @@ -98,19 +98,19 @@ if (!specialConstructor) { return super.getMessage(); } - String expected = ""; + StringBuffer expected = new StringBuffer(); int maxSize = 0; for (int i = 0; i < expectedTokenSequences.length; i++) { if (maxSize < expectedTokenSequences[i].length) { maxSize = expectedTokenSequences[i].length; } for (int j = 0; j < expectedTokenSequences[i].length; j++) { - expected += tokenImage[expectedTokenSequences[i][j]] + " "; + expected.append(tokenImage[expectedTokenSequences[i][j]]).append(" "); } if (expectedTokenSequences[i][expectedTokenSequences[i].length - 1] != 0) { - expected += "..."; + expected.append("..."); } - expected += eol + " "; + expected.append(eol).append(" "); } String retval = "Encountered \""; Token tok = currentToken.next; @@ -130,7 +130,7 @@ } else { retval += "Was expecting one of:" + eol + " "; } - retval += expected; + retval += expected.toString(); return retval; } @@ -179,7 +179,7 @@ default: if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) { String s = "0000" + Integer.toString(ch, 16); - retval.append("\\u").append(s.substring(s.length() - 4, s.length())); + retval.append("\\u" + s.substring(s.length() - 4, s.length())); } else { retval.append(ch); } Index: src/demo/org/apache/lucene/demo/html/HTMLParserTokenManager.java =================================================================== --- src/demo/org/apache/lucene/demo/html/HTMLParserTokenManager.java (revision 684150) +++ src/demo/org/apache/lucene/demo/html/HTMLParserTokenManager.java (working copy) @@ -1457,14 +1457,12 @@ private final int[] jjrounds = new int[28]; private final int[] jjstateSet = new int[56]; protected char curChar; -public HTMLParserTokenManager(SimpleCharStream stream) -{ +public HTMLParserTokenManager(SimpleCharStream stream){ if (SimpleCharStream.staticFlag) throw new Error("ERROR: Cannot use a static CharStream class with a non-static lexical analyzer."); input_stream = stream; } -public HTMLParserTokenManager(SimpleCharStream stream, int lexState) -{ +public HTMLParserTokenManager(SimpleCharStream stream, int lexState){ this(stream); SwitchTo(lexState); } Index: contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java 
=================================================================== --- contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java (revision 684150) +++ contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java (working copy) @@ -1,64 +1,30 @@ package org.apache.lucene.analysis.snowball; -/* ==================================================================== - * The Apache Software License, Version 1.1 +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at * - * Copyright (c) 2004 The Apache Software Foundation. All rights - * reserved. + * http://www.apache.org/licenses/LICENSE-2.0 * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. The end-user documentation included with the redistribution, - * if any, must include the following acknowledgment: - * "This product includes software developed by the - * Apache Software Foundation (http://www.apache.org/)." - * Alternately, this acknowledgment may appear in the software itself, - * if and wherever such third-party acknowledgments normally appear. - * - * 4. The names "Apache" and "Apache Software Foundation" and - * "Apache Lucene" must not be used to endorse or promote products - * derived from this software without prior written permission. For - * written permission, please contact apache@apache.org. - * - * 5. Products derived from this software may not be called "Apache", - * "Apache Lucene", nor may "Apache" appear in their name, without - * prior written permission of the Apache Software Foundation. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * ==================================================================== - * - * This software consists of voluntary contributions made by many - * individuals on behalf of the Apache Software Foundation. For more - * information on the Apache Software Foundation, please see - * . 
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ -import java.io.*; +import java.io.StringReader; -import junit.framework.*; +import junit.framework.TestCase; -import org.apache.lucene.analysis.*; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.index.Payload; +import org.apache.lucene.analysis.TokenStream; public class TestSnowball extends TestCase { @@ -66,12 +32,12 @@ String input, String[] output) throws Exception { TokenStream ts = a.tokenStream("dummy", new StringReader(input)); + final Token reusableToken = new Token(); for (int i = 0; i < output.length; i++) { - Token t = ts.next(); - assertNotNull(t); - assertEquals(output[i], t.termText()); + Token t = ts.next(reusableToken); + assertEquals(output[i], t.term()); } - assertNull(ts.next()); + assertNull(ts.next(reusableToken)); ts.close(); } @@ -83,25 +49,32 @@ public void testFilterTokens() throws Exception { - final Token tok = new Token("accents", 2, 7, "wrd"); + final Token tok = new Token(2, 7, "wrd"); + tok.setTermBuffer("accents"); tok.setPositionIncrement(3); + Payload tokPayload = new Payload(new byte[]{0,1,2,3}); + tok.setPayload(tokPayload); + int tokFlags = 77; + tok.setFlags(tokFlags); SnowballFilter filter = new SnowballFilter( new TokenStream() { - public Token next() { + public Token next(Token token) { + assert token != null; return tok; } }, "English" ); - Token newtok = filter.next(); + Token newtok = filter.next(new Token()); - assertEquals("accent", newtok.termText()); + assertEquals("accent", newtok.term()); assertEquals(2, newtok.startOffset()); assertEquals(7, newtok.endOffset()); assertEquals("wrd", newtok.type()); assertEquals(3, newtok.getPositionIncrement()); + assertEquals(tokFlags, newtok.getFlags()); + assertEquals(tokPayload, newtok.getPayload()); } -} - +} \ No newline at end of file Index: contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java =================================================================== --- contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java (revision 684150) +++ contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java (working copy) @@ -18,11 +18,10 @@ */ import java.io.IOException; - import java.lang.reflect.Method; import net.sf.snowball.SnowballProgram; -import net.sf.snowball.ext.*; +import net.sf.snowball.ext.EnglishStemmer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; @@ -60,20 +59,22 @@ } /** Returns the next input Token, after being stemmed */ - public final Token next() throws IOException { - Token token = input.next(); + public final Token next(Token token) throws IOException { + assert token != null; + token = input.next(token); if (token == null) return null; - stemmer.setCurrent(token.termText()); + String originalTerm = token.term(); + stemmer.setCurrent(originalTerm); try { stemMethod.invoke(stemmer, EMPTY_ARGS); } catch (Exception e) { throw new RuntimeException(e.toString()); } - - Token newToken = new Token(stemmer.getCurrent(), - token.startOffset(), token.endOffset(), token.type()); - newToken.setPositionIncrement(token.getPositionIncrement()); - return newToken; + String finalTerm = 
stemmer.getCurrent(); + // Don't bother updating, if it is unchanged. + if (!originalTerm.equals(finalTerm)) + token.setTermBuffer(finalTerm); + return token; } } Index: contrib/wordnet/src/java/org/apache/lucene/wordnet/SynLookup.java =================================================================== --- contrib/wordnet/src/java/org/apache/lucene/wordnet/SynLookup.java (revision 684150) +++ contrib/wordnet/src/java/org/apache/lucene/wordnet/SynLookup.java (working copy) @@ -17,15 +17,29 @@ * limitations under the License. */ -import org.apache.lucene.store.*; -import org.apache.lucene.search.*; -import org.apache.lucene.index.*; -import org.apache.lucene.document.*; -import org.apache.lucene.analysis.*; -import java.io.*; -import java.util.*; +import java.io.IOException; +import java.io.StringReader; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Searcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.FSDirectory; + /** * Test program to look up synonyms. */ @@ -86,10 +100,9 @@ // [1] Parse query into separate words so that when we expand we can avoid dups TokenStream ts = a.tokenStream( field, new StringReader( query)); - org.apache.lucene.analysis.Token t; - while ( (t = ts.next()) != null) - { - String word = t.termText(); + final Token reusableToken = new Token(); + for (Token token = ts.next(reusableToken); token != null; token = ts.next(reusableToken)) { + String word = token.term(); if ( already.add( word)) top.add( word); } Index: contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java =================================================================== --- contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java (revision 684150) +++ contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java (working copy) @@ -17,16 +17,30 @@ * limitations under the License. 
*/ -import org.apache.lucene.store.*; -import org.apache.lucene.search.*; -import org.apache.lucene.index.*; -import org.apache.lucene.document.*; -import org.apache.lucene.analysis.*; -import org.apache.lucene.analysis.standard.*; -import java.io.*; -import java.util.*; +import java.io.IOException; +import java.io.StringReader; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Searcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.FSDirectory; + /** * Expand a query by looking up synonyms for every term. * You need to invoke {@link Syns2Index} first to build the synonym index. @@ -99,10 +113,10 @@ // [1] Parse query into separate words so that when we expand we can avoid dups TokenStream ts = a.tokenStream( field, new StringReader( query)); - org.apache.lucene.analysis.Token t; - while ( (t = ts.next()) != null) - { - String word = t.termText(); + + final Token reusableToken = new Token(); + for (Token token = ts.next(reusableToken); token != null; token = ts.next(reusableToken)) { + String word = token.term(); if ( already.add( word)) top.add( word); } Index: contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java =================================================================== --- contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java (revision 684150) +++ contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java (working copy) @@ -15,19 +15,32 @@ * limitations under the License. */ +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; + import junit.framework.TestCase; + import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.index.*; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Payload; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.TermFreqVector; +import org.apache.lucene.index.TermPositionVector; +import org.apache.lucene.index.TermPositions; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; -import java.io.IOException; -import java.util.*; - /** * Asserts equality of content and behaviour of two index readers. 
*/ @@ -151,21 +164,24 @@ document.add(f); if (i > 4) { final List tokens = new ArrayList(2); - Token t = new Token("the", 0, 2, "text"); + Token t = createToken("the", 0, 2, "text"); t.setPayload(new Payload(new byte[]{1, 2, 3})); tokens.add(t); - t = new Token("end", 3, 5, "text"); + t = createToken("end", 3, 5, "text"); t.setPayload(new Payload(new byte[]{2})); tokens.add(t); - tokens.add(new Token("fin", 7, 9)); + tokens.add(createToken("fin", 7, 9)); document.add(new Field("f", new TokenStream() { Iterator it = tokens.iterator(); - public Token next() throws IOException { + public Token next(Token token) throws IOException { + assert token != null; if (!it.hasNext()) { return null; } - return it.next(); + // Resettable token streams need to return clones. + token = (Token) it.next(); + return (Token) token.clone(); } public void reset() throws IOException { @@ -466,4 +482,19 @@ testReader.close(); } + private static Token createToken(String term, int start, int offset) + { + Token token = new Token(start, offset); + token.setTermBuffer(term); + return token; + } + + private static Token createToken(String term, int start, int offset, String type) + { + Token token = new Token(start, offset, type); + token.setTermBuffer(term); + return token; + } + + } Index: contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java =================================================================== --- contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java (revision 684150) +++ contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java (working copy) @@ -520,12 +520,13 @@ } else { tokenStream = analyzer.tokenStream(field.name(), new StringReader(field.stringValue())); } - Token next = tokenStream.next(); + final Token reusableToken = new Token(); + Token next = tokenStream.next(reusableToken); + while (next != null) { - next.setTermText(next.termText().intern()); // todo: not sure this needs to be interned? - tokens.add(next); // the vector will be built on commit. - next = tokenStream.next(); + tokens.add((Token) next.clone()); // the vector will be built on commit. 
+ next = tokenStream.next(reusableToken); fieldSetting.fieldLength++; if (fieldSetting.fieldLength > maxFieldLength) { break; @@ -533,7 +534,10 @@ } } else { // untokenized - tokens.add(new Token(field.stringValue().intern(), 0, field.stringValue().length(), "untokenized")); + String fieldVal = field.stringValue(); + Token token = new Token(0, fieldVal.length(), "untokenized"); + token.setTermBuffer(fieldVal); + tokens.add(token); fieldSetting.fieldLength++; } } @@ -567,10 +571,10 @@ for (Token token : eField_Tokens.getValue()) { - TermDocumentInformationFactory termDocumentInformationFactory = termDocumentInformationFactoryByTermText.get(token.termText()); + TermDocumentInformationFactory termDocumentInformationFactory = termDocumentInformationFactoryByTermText.get(token.term()); if (termDocumentInformationFactory == null) { termDocumentInformationFactory = new TermDocumentInformationFactory(); - termDocumentInformationFactoryByTermText.put(token.termText(), termDocumentInformationFactory); + termDocumentInformationFactoryByTermText.put(token.term(), termDocumentInformationFactory); } //termDocumentInformationFactory.termFrequency++; Index: contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsFilterBuilder.java =================================================================== --- contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsFilterBuilder.java (revision 684150) +++ contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsFilterBuilder.java (working copy) @@ -59,20 +59,21 @@ try { - Token token = ts.next(); + final Token reusableToken = new Token(); + Token token = ts.next(reusableToken); Term term = null; while (token != null) { if (term == null) { - term = new Term(fieldName, token.termText()); + term = new Term(fieldName, token.term()); } else { // create from previous to save fieldName.intern overhead - term = term.createTerm(token.termText()); + term = term.createTerm(token.term()); } tf.addTerm(term); - token = ts.next(); + token = ts.next(reusableToken); } } catch (IOException ioe) Index: contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java =================================================================== --- contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java (revision 684150) +++ contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java (working copy) @@ -74,16 +74,18 @@ if((stopWords!=null)&&(fields!=null)) { stopWordsSet=new HashSet(); + final Token reusableToken = new Token(); + Token stopToken; for (int i = 0; i < fields.length; i++) { TokenStream ts = analyzer.tokenStream(fields[i],new StringReader(stopWords)); try { - Token stopToken=ts.next(); + stopToken=ts.next(reusableToken); while(stopToken!=null) { - stopWordsSet.add(stopToken.termText()); - stopToken=ts.next(); + stopWordsSet.add(stopToken.term()); + stopToken=ts.next(reusableToken); } } catch(IOException ioe) Index: contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/SpanOrTermsBuilder.java =================================================================== --- contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/SpanOrTermsBuilder.java (revision 684150) +++ contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/SpanOrTermsBuilder.java (working copy) @@ -52,12 +52,13 @@ { ArrayList clausesList=new ArrayList(); TokenStream ts=analyzer.tokenStream(fieldName,new 
StringReader(value)); - Token token=ts.next(); + final Token reusableToken = new Token(); + Token token = ts.next(reusableToken); while(token!=null) { - SpanTermQuery stq=new SpanTermQuery(new Term(fieldName,token.termText())); + SpanTermQuery stq=new SpanTermQuery(new Term(fieldName,token.term())); clausesList.add(stq); - token=ts.next(); + token=ts.next(reusableToken); } SpanOrQuery soq=new SpanOrQuery((SpanQuery[]) clausesList.toArray(new SpanQuery[clausesList.size()])); soq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f)); Index: contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsQueryBuilder.java =================================================================== --- contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsQueryBuilder.java (revision 684150) +++ contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsQueryBuilder.java (working copy) @@ -58,20 +58,21 @@ TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(text)); try { - Token token = ts.next(); + final Token reusableToken = new Token(); + Token token = ts.next(reusableToken); Term term = null; while (token != null) { if (term == null) { - term = new Term(fieldName, token.termText()); + term = new Term(fieldName, token.term()); } else { // create from previous to save fieldName.intern overhead - term = term.createTerm(token.termText()); + term = term.createTerm(token.term()); } bq.add(new BooleanClause(new TermQuery(term),BooleanClause.Occur.SHOULD)); - token = ts.next(); + token = ts.next(reusableToken); } } catch (IOException ioe) Index: contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java =================================================================== --- contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (revision 684150) +++ contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (working copy) @@ -1120,21 +1120,22 @@ { lst = new ArrayList(); Token t; - t = new Token("hi", 0, 2); + t = createToken("hi", 0, 2); lst.add(t); - t = new Token("hispeed", 0, 8); + t = createToken("hispeed", 0, 8); lst.add(t); - t = new Token("speed", 3, 8); + t = createToken("speed", 3, 8); t.setPositionIncrement(0); lst.add(t); - t = new Token("10", 8, 10); + t = createToken("10", 8, 10); lst.add(t); - t = new Token("foo", 11, 14); + t = createToken("foo", 11, 14); lst.add(t); iter = lst.iterator(); } - public Token next() throws IOException { + public Token next(Token token) throws IOException { + assert token != null; return iter.hasNext() ? (Token) iter.next() : null; } }; @@ -1149,21 +1150,22 @@ { lst = new ArrayList(); Token t; - t = new Token("hispeed", 0, 8); + t = createToken("hispeed", 0, 8); lst.add(t); - t = new Token("hi", 0, 2); + t = createToken("hi", 0, 2); t.setPositionIncrement(0); lst.add(t); - t = new Token("speed", 3, 8); + t = createToken("speed", 3, 8); lst.add(t); - t = new Token("10", 8, 10); + t = createToken("10", 8, 10); lst.add(t); - t = new Token("foo", 11, 14); + t = createToken("foo", 11, 14); lst.add(t); iter = lst.iterator(); } - public Token next() throws IOException { + public Token next(Token token) throws IOException { + assert token != null; return iter.hasNext() ? 
(Token) iter.next() : null; } }; @@ -1346,6 +1348,13 @@ super.tearDown(); } + private static Token createToken(String term, int start, int offset) + { + Token token = new Token(start, offset); + token.setTermBuffer(term); + return token; + } + } // =================================================================== @@ -1392,31 +1401,32 @@ this.synonyms = synonyms; } - public Token next() throws IOException { + public Token next(Token token) throws IOException { + assert token != null; if (currentRealToken == null) { - Token nextRealToken = realStream.next(); + Token nextRealToken = realStream.next(token); if (nextRealToken == null) { return null; } - String expansions = (String) synonyms.get(nextRealToken.termText()); + String expansions = (String) synonyms.get(nextRealToken.term()); if (expansions == null) { return nextRealToken; } st = new StringTokenizer(expansions, ","); if (st.hasMoreTokens()) { - currentRealToken = nextRealToken; + currentRealToken = (Token) nextRealToken.clone(); } return currentRealToken; } else { - String nextExpandedValue = st.nextToken(); - Token expandedToken = new Token(nextExpandedValue, currentRealToken.startOffset(), - currentRealToken.endOffset()); - expandedToken.setPositionIncrement(0); + token.reinit(st.nextToken(), + currentRealToken.startOffset(), + currentRealToken.endOffset()); + token.setPositionIncrement(0); if (!st.hasMoreTokens()) { currentRealToken = null; st = null; } - return expandedToken; + return token; } } Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java (revision 684150) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java (working copy) @@ -121,7 +121,7 @@ */ public float getTokenScore(Token token) { position += token.getPositionIncrement(); - String termText = new String(token.termBuffer(), 0, token.termLength()); + String termText = token.term(); WeightedSpanTerm weightedSpanTerm; Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (revision 684150) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (working copy) @@ -106,7 +106,7 @@ */ public float getTokenScore(Token token) { - String termText=token.termText(); + String termText=token.term(); WeightedTerm queryTerm=(WeightedTerm) termsToFind.get(termText); if(queryTerm==null) Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (revision 684150) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (working copy) @@ -147,8 +147,9 @@ { this.tokens=tokens; } - public Token next() + public Token next(Token token) { + assert token != null; if(currentToken>=tokens.length) { return null; @@ -160,6 +161,7 @@ String[] terms=tpv.getTerms(); int[] freq=tpv.getTermFrequencies(); int totalTokens=0; + Token newToken = new Token(); for (int t = 0; t < freq.length; t++) { totalTokens+=freq[t]; @@ -189,9 +191,8 @@ } for (int tp = 0; tp < offsets.length; tp++) { - unsortedTokens.add(new Token(terms[t], - offsets[tp].getStartOffset(), - 
offsets[tp].getEndOffset())); + newToken.reinit(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset()); + unsortedTokens.add(newToken.clone()); } } else @@ -204,9 +205,8 @@ //tokens stored with positions - can use this to index straight into sorted array for (int tp = 0; tp < pos.length; tp++) { - tokensInOriginalOrder[pos[tp]]=new Token(terms[t], - offsets[tp].getStartOffset(), - offsets[tp].getEndOffset()); + newToken.reinit(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset()); + tokensInOriginalOrder[pos[tp]] = (Token) newToken.clone(); } } } @@ -261,7 +261,7 @@ } return getTokenStream(field, contents, analyzer); } - //conevenience method + //convenience method public static TokenStream getTokenStream(String field, String contents, Analyzer analyzer){ return analyzer.tokenStream(field,new StringReader(contents)); } Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java (revision 684150) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java (working copy) @@ -62,7 +62,7 @@ return false; } - WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(new String(token.termBuffer(), 0, token.termLength())); + WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(token.term()); if (wSpanTerm != null) { List positionSpans = wSpanTerm.getPositionSpans(); Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java (revision 684150) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java (working copy) @@ -61,7 +61,7 @@ tot+=score; } } - tokens[numTokens]=token; + tokens[numTokens]= (Token) token.clone(); scores[numTokens]=score; numTokens++; } Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (revision 684150) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (working copy) @@ -22,6 +22,7 @@ import java.util.Iterator; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.util.PriorityQueue; @@ -217,7 +218,7 @@ try { - org.apache.lucene.analysis.Token token; + final Token reusableToken = new Token(); String tokenText; int startOffset; int endOffset; @@ -225,7 +226,7 @@ textFragmenter.start(text); TokenGroup tokenGroup=new TokenGroup(); - token = tokenStream.next(); + Token token = tokenStream.next(reusableToken); while ((token!= null)&&(token.startOffset()< maxDocCharsToAnalyze)) { if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token))) @@ -261,7 +262,7 @@ // { // break; // } - token = tokenStream.next(); + token = tokenStream.next(reusableToken); } currentFrag.setScore(fragmentScorer.getFragmentScore()); Index: contrib/miscellaneous/src/test/org/apache/lucene/queryParser/precedence/TestPrecedenceQueryParser.java =================================================================== --- contrib/miscellaneous/src/test/org/apache/lucene/queryParser/precedence/TestPrecedenceQueryParser.java 
(revision 685115) +++ contrib/miscellaneous/src/test/org/apache/lucene/queryParser/precedence/TestPrecedenceQueryParser.java (working copy) @@ -57,19 +57,26 @@ boolean inPhrase = false; int savedStart = 0, savedEnd = 0; - public Token next() throws IOException { + public Token next(Token token) throws IOException { + assert token != null; if (inPhrase) { inPhrase = false; - return new Token("phrase2", savedStart, savedEnd); + token.setTermBuffer("phrase2"); + token.setStartOffset(savedStart); + token.setEndOffset(savedEnd); + return token; } else - for (Token token = input.next(); token != null; token = input.next()) { - if (token.termText().equals("phrase")) { + for (token = input.next(token); token != null; token = input.next(token)) { + if (token.term().equals("phrase")) { inPhrase = true; savedStart = token.startOffset(); savedEnd = token.endOffset(); - return new Token("phrase1", savedStart, savedEnd); - } else if (!token.termText().equals("stop")) + token.setTermBuffer("phrase1"); + token.setStartOffset(savedStart); + token.setEndOffset(savedEnd); return token; + } else if (!token.term().equals("stop")) + return token; } return null; } Index: contrib/miscellaneous/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java =================================================================== --- contrib/miscellaneous/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java (revision 685115) +++ contrib/miscellaneous/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java (working copy) @@ -23,6 +23,7 @@ import java.util.List; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.search.Query; @@ -105,21 +106,23 @@ // get Analyzer from superclass and tokenize the term TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr)); - org.apache.lucene.analysis.Token t; + final Token reusableToken = new Token(); + Token token; int countTokens = 0; while (true) { try { - t = source.next(); + token = source.next(reusableToken); } catch (IOException e) { - t = null; + token = null; } - if (t == null) { + if (token == null) { break; } - if (!"".equals(t.termText())) { + String term = token.term(); + if (!"".equals(term)) { try { - tlist.set(countTokens++, t.termText()); + tlist.set(countTokens++, term); } catch (IndexOutOfBoundsException ioobe) { countTokens = -1; } @@ -189,18 +192,19 @@ // get Analyzer from superclass and tokenize the term TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr)); List tlist = new ArrayList(); - org.apache.lucene.analysis.Token t; + final Token reusableToken = new Token(); + Token token; while (true) { try { - t = source.next(); + token = source.next(reusableToken); } catch (IOException e) { - t = null; + token = null; } - if (t == null) { + if (token == null) { break; } - tlist.add(t.termText()); + tlist.add(token.term()); } try { @@ -238,14 +242,16 @@ throws ParseException { // get Analyzer from superclass and tokenize the term TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr)); - org.apache.lucene.analysis.Token t; + final Token reusableToken = new Token(); + Token token; + Token multipleToken = new Token(); boolean multipleTokens = false; try { - t = source.next(); - multipleTokens = source.next() != null; + token = source.next(reusableToken); + multipleTokens = 
source.next(multipleToken) != null; } catch (IOException e) { - t = null; + token = null; } try { @@ -259,7 +265,7 @@ + " - tokens were added"); } - return (t == null) ? null : super.getFuzzyQuery(field, t.termText(), minSimilarity); + return (token == null) ? null : super.getFuzzyQuery(field, token.term(), minSimilarity); } /** @@ -270,18 +276,21 @@ throws ParseException { // get Analyzer from superclass and tokenize the terms TokenStream source = getAnalyzer().tokenStream(field, new StringReader(part1)); - org.apache.lucene.analysis.Token t; + final Token reusableToken = new Token(); + final Token reusableMultipleToken = new Token(); + Token token; + Token multipleToken; boolean multipleTokens = false; // part1 try { - t = source.next(); - if (t != null) { - part1 = t.termText(); + token = source.next(reusableToken); + if (token != null) { + part1 = token.term(); } - multipleTokens = source.next() != null; + multipleTokens = source.next(reusableMultipleToken) != null; } catch (IOException e) { - t = null; + token = null; } try { source.close(); @@ -293,16 +302,16 @@ + " - tokens were added to part1"); } + // part2 source = getAnalyzer().tokenStream(field, new StringReader(part2)); - // part2 try { - t = source.next(); - if (t != null) { - part2 = t.termText(); + token = source.next(reusableToken); + if (token != null) { + part2 = token.term(); } - multipleTokens = source.next() != null; + multipleTokens = source.next(reusableMultipleToken) != null; } catch (IOException e) { - t = null; + token = null; } try { source.close(); Index: contrib/miscellaneous/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.java =================================================================== --- contrib/miscellaneous/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.java (revision 685115) +++ contrib/miscellaneous/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.java (working copy) @@ -1,14 +1,29 @@ /* Generated By:JavaCC: Do not edit this line. 
PrecedenceQueryParser.java */ package org.apache.lucene.queryParser.precedence; +import java.io.IOException; +import java.io.StringReader; +import java.text.DateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.Locale; import java.util.Vector; -import java.io.*; -import java.text.*; -import java.util.*; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.DateTools; import org.apache.lucene.index.Term; -import org.apache.lucene.analysis.*; -import org.apache.lucene.document.*; -import org.apache.lucene.search.*; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.MultiPhraseQuery; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.RangeQuery; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.Parameter; /** @@ -296,20 +311,21 @@ TokenStream source = analyzer.tokenStream(field, new StringReader(queryText)); Vector v = new Vector(); + final org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token(); org.apache.lucene.analysis.Token t; int positionCount = 0; boolean severalTokensAtSamePosition = false; while (true) { try { - t = source.next(); + t = source.next(reusableToken); } catch (IOException e) { t = null; } if (t == null) break; - v.addElement(t); + v.addElement(t.clone()); if (t.getPositionIncrement() == 1) positionCount++; else Index: contrib/miscellaneous/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.jj =================================================================== --- contrib/miscellaneous/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.jj (revision 685115) +++ contrib/miscellaneous/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.jj (working copy) @@ -25,14 +25,29 @@ package org.apache.lucene.queryParser.precedence; +import java.io.IOException; +import java.io.StringReader; +import java.text.DateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.Locale; import java.util.Vector; -import java.io.*; -import java.text.*; -import java.util.*; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.DateTools; import org.apache.lucene.index.Term; -import org.apache.lucene.analysis.*; -import org.apache.lucene.document.*; -import org.apache.lucene.search.*; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.MultiPhraseQuery; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.RangeQuery; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.Parameter; /** @@ -320,20 +335,21 @@ TokenStream source = analyzer.tokenStream(field, new StringReader(queryText)); Vector v = new Vector(); + final org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token(); org.apache.lucene.analysis.Token t; int positionCount = 0; 
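// Illustrative sketch, not part of this patch: the producer side of the same API change.
// A filter's next(Token) receives the caller's reusable Token, passes it down to
// input.next(token), and edits it in place via the term buffer instead of allocating a new
// Token per call. LowerCasingFilter is a hypothetical name used only for illustration.
import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

public final class LowerCasingFilter extends TokenFilter {
  public LowerCasingFilter(TokenStream in) {
    super(in);
  }

  public Token next(Token token) throws IOException {
    token = input.next(token);
    if (token == null)
      return null;
    // Modify the reusable token in place rather than building a replacement.
    char[] buffer = token.termBuffer();
    int length = token.termLength();
    for (int i = 0; i < length; i++)
      buffer[i] = Character.toLowerCase(buffer[i]);
    return token;
  }
}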
boolean severalTokensAtSamePosition = false; while (true) { try { - t = source.next(); + t = source.next(reusableToken); } catch (IOException e) { t = null; } if (t == null) break; - v.addElement(t); + v.addElement(t.clone()); if (t.getPositionIncrement() == 1) positionCount++; else Index: contrib/miscellaneous/src/java/org/apache/lucene/queryParser/precedence/CharStream.java =================================================================== --- contrib/miscellaneous/src/java/org/apache/lucene/queryParser/precedence/CharStream.java (revision 685115) +++ contrib/miscellaneous/src/java/org/apache/lucene/queryParser/precedence/CharStream.java (working copy) @@ -26,6 +26,20 @@ char readChar() throws java.io.IOException; /** + * Returns the column position of the character last read. + * @deprecated + * @see #getEndColumn + */ + int getColumn(); + + /** + * Returns the line number of the character last read. + * @deprecated + * @see #getEndLine + */ + int getLine(); + + /** * Returns the column number of the last character for current token (being * matched after the last call to BeginTOken). */ Index: contrib/miscellaneous/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParserTokenManager.java =================================================================== --- contrib/miscellaneous/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParserTokenManager.java (revision 685115) +++ contrib/miscellaneous/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParserTokenManager.java (working copy) @@ -1,13 +1,27 @@ /* Generated By:JavaCC: Do not edit this line. PrecedenceQueryParserTokenManager.java */ package org.apache.lucene.queryParser.precedence; +import java.io.IOException; +import java.io.StringReader; +import java.text.DateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.Locale; import java.util.Vector; -import java.io.*; -import java.text.*; -import java.util.*; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.DateTools; import org.apache.lucene.index.Term; -import org.apache.lucene.analysis.*; -import org.apache.lucene.document.*; -import org.apache.lucene.search.*; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.MultiPhraseQuery; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.RangeQuery; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.Parameter; public class PrecedenceQueryParserTokenManager implements PrecedenceQueryParserConstants Index: contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java =================================================================== --- contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java (revision 684150) +++ contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java (working copy) @@ -126,14 +126,14 @@ tcm.put("3.25", ""); tcm.put("3.50", ""); WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test)); - Token token = new Token(); int count = 0; int numItalics = 0; int numBoldItalics = 0; int numCategory = 0; int numCitation = 0; - while ((token = 
tf.next(token)) != null) { - String tokText = token.termText(); + final Token reusableToken = new Token(); + for (Token token = tf.next(reusableToken); token != null; token = tf.next(reusableToken)) { + String tokText = token.term(); //System.out.println("Text: " + tokText + " Type: " + token.type()); assertTrue("token is null and it shouldn't be", token != null); String expectedType = (String) tcm.get(tokText); @@ -166,104 +166,104 @@ } private void checkLinkPhrases(WikipediaTokenizer tf) throws IOException { - Token token = new Token(); - token = tf.next(token); + final Token reusableToken = new Token(); + Token token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "click", new String(token.termBuffer(), 0, token.termLength()).equals("click") == true); + assertTrue(token.term() + " is not equal to " + "click", token.term().equals("click") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); - token = tf.next(token); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link", new String(token.termBuffer(), 0, token.termLength()).equals("link") == true); + assertTrue(token.term() + " is not equal to " + "link", token.term().equals("link") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); - token = tf.next(token); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here", - new String(token.termBuffer(), 0, token.termLength()).equals("here") == true); + assertTrue(token.term() + " is not equal to " + "here", + token.term().equals("here") == true); //The link, and here should be at the same position for phrases to work assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); - token = tf.next(token); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "again", - new String(token.termBuffer(), 0, token.termLength()).equals("again") == true); + assertTrue(token.term() + " is not equal to " + "again", + token.term().equals("again") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); - token = tf.next(token); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "click", - new String(token.termBuffer(), 0, token.termLength()).equals("click") == true); + assertTrue(token.term() + " is not equal to " + "click", + token.term().equals("click") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); - token = tf.next(token); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org", - new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org") == true); + 
assertTrue(token.term() + " is not equal to " + "http://lucene.apache.org", + token.term().equals("http://lucene.apache.org") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); - token = tf.next(token); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here", - new String(token.termBuffer(), 0, token.termLength()).equals("here") == true); + assertTrue(token.term() + " is not equal to " + "here", + token.term().equals("here") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0); - token = tf.next(token); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "again", - new String(token.termBuffer(), 0, token.termLength()).equals("again") == true); + assertTrue(token.term() + " is not equal to " + "again", + token.term().equals("again") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); - token = tf.next(token); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a", - new String(token.termBuffer(), 0, token.termLength()).equals("a") == true); + assertTrue(token.term() + " is not equal to " + "a", + token.term().equals("a") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); - token = tf.next(token); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "b", - new String(token.termBuffer(), 0, token.termLength()).equals("b") == true); + assertTrue(token.term() + " is not equal to " + "b", + token.term().equals("b") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); - token = tf.next(token); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "c", - new String(token.termBuffer(), 0, token.termLength()).equals("c") == true); + assertTrue(token.term() + " is not equal to " + "c", + token.term().equals("c") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); - token = tf.next(token); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "d", - new String(token.termBuffer(), 0, token.termLength()).equals("d") == true); + assertTrue(token.term() + " is not equal to " + "d", + token.term().equals("d") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is not null and it should be", token == null); } public void testLinks() throws Exception { String test = "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here] 
[https://lucene.apache.org/java/docs/index.html?b=c here]"; WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test)); - Token token = new Token(); - token = tf.next(token); + final Token reusableToken = new Token(); + Token token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org/java/docs/index.html#news", - new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org/java/docs/index.html#news") == true); + assertTrue(token.term() + " is not equal to " + "http://lucene.apache.org/java/docs/index.html#news", + token.term().equals("http://lucene.apache.org/java/docs/index.html#news") == true); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true); - tf.next(token);//skip here - token = tf.next(token); + tf.next(reusableToken);//skip here + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org/java/docs/index.html?b=c", - new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org/java/docs/index.html?b=c") == true); + assertTrue(token.term() + " is not equal to " + "http://lucene.apache.org/java/docs/index.html?b=c", + token.term().equals("http://lucene.apache.org/java/docs/index.html?b=c") == true); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true); - tf.next(token);//skip here - token = tf.next(token); + tf.next(reusableToken);//skip here + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "https://lucene.apache.org/java/docs/index.html?b=c", - new String(token.termBuffer(), 0, token.termLength()).equals("https://lucene.apache.org/java/docs/index.html?b=c") == true); + assertTrue(token.term() + " is not equal to " + "https://lucene.apache.org/java/docs/index.html?b=c", + token.term().equals("https://lucene.apache.org/java/docs/index.html?b=c") == true); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is not null and it should be", token == null); } @@ -277,71 +277,71 @@ checkLinkPhrases(tf); String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]"; tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.UNTOKENIZED_ONLY, untoks); - Token token; - token = tf.next(); + final Token reusableToken = new Token(); + Token token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a b c d", - new String(token.termBuffer(), 0, token.termLength()).equals("a b c d") == true); + assertTrue(token.term() + " is not equal to " + "a b c d", + token.term().equals("a b c d") 
== true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); assertTrue(token.startOffset() + " does not equal: " + 11, token.startOffset() == 11); assertTrue(token.endOffset() + " does not equal: " + 18, token.endOffset() == 18); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "e f g", - new String(token.termBuffer(), 0, token.termLength()).equals("e f g") == true); + assertTrue(token.term() + " is not equal to " + "e f g", + token.term().equals("e f g") == true); assertTrue(token.startOffset() + " does not equal: " + 32, token.startOffset() == 32); assertTrue(token.endOffset() + " does not equal: " + 37, token.endOffset() == 37); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link", - new String(token.termBuffer(), 0, token.termLength()).equals("link") == true); + assertTrue(token.term() + " is not equal to " + "link", + token.term().equals("link") == true); assertTrue(token.startOffset() + " does not equal: " + 42, token.startOffset() == 42); assertTrue(token.endOffset() + " does not equal: " + 46, token.endOffset() == 46); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here", - new String(token.termBuffer(), 0, token.termLength()).equals("here") == true); + assertTrue(token.term() + " is not equal to " + "here", + token.term().equals("here") == true); assertTrue(token.startOffset() + " does not equal: " + 47, token.startOffset() == 47); assertTrue(token.endOffset() + " does not equal: " + 51, token.endOffset() == 51); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link", - new String(token.termBuffer(), 0, token.termLength()).equals("link") == true); + assertTrue(token.term() + " is not equal to " + "link", + token.term().equals("link") == true); assertTrue(token.startOffset() + " does not equal: " + 56, token.startOffset() == 56); assertTrue(token.endOffset() + " does not equal: " + 60, token.endOffset() == 60); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "there", - new String(token.termBuffer(), 0, token.termLength()).equals("there") == true); + assertTrue(token.term() + " is not equal to " + "there", + token.term().equals("there") == true); assertTrue(token.startOffset() + " does not equal: " + 61, token.startOffset() == 61); assertTrue(token.endOffset() + " does not equal: " + 66, token.endOffset() == 66); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics here", - new String(token.termBuffer(), 0, token.termLength()).equals("italics here") == true); + assertTrue(token.term() + " is not equal to " + "italics here", + token.term().equals("italics here") == true); 
assertTrue(token.startOffset() + " does not equal: " + 71, token.startOffset() == 71); assertTrue(token.endOffset() + " does not equal: " + 83, token.endOffset() == 83); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "something", - new String(token.termBuffer(), 0, token.termLength()).equals("something") == true); + assertTrue(token.term() + " is not equal to " + "something", + token.term().equals("something") == true); assertTrue(token.startOffset() + " does not equal: " + 86, token.startOffset() == 86); assertTrue(token.endOffset() + " does not equal: " + 95, token.endOffset() == 95); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "more italics", - new String(token.termBuffer(), 0, token.termLength()).equals("more italics") == true); + assertTrue(token.term() + " is not equal to " + "more italics", + token.term().equals("more italics") == true); assertTrue(token.startOffset() + " does not equal: " + 98, token.startOffset() == 98); assertTrue(token.endOffset() + " does not equal: " + 110, token.endOffset() == 110); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "h i j", - new String(token.termBuffer(), 0, token.termLength()).equals("h i j") == true); + assertTrue(token.term() + " is not equal to " + "h i j", + token.term().equals("h i j") == true); assertTrue(token.startOffset() + " does not equal: " + 124, token.startOffset() == 124); assertTrue(token.endOffset() + " does not equal: " + 133, token.endOffset() == 133); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is not null and it should be", token == null); } @@ -352,48 +352,48 @@ String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]"; //should output all the indivual tokens plus the untokenized tokens as well. 
Untokenized tokens WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.BOTH, untoks); - Token token; - token = tf.next(); + final Token reusableToken = new Token(); + Token token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a b c d", - new String(token.termBuffer(), 0, token.termLength()).equals("a b c d") == true); + assertTrue(token.term() + " is not equal to " + "a b c d", + token.term().equals("a b c d") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true); assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG); assertTrue(token.startOffset() + " does not equal: " + 11, token.startOffset() == 11); assertTrue(token.endOffset() + " does not equal: " + 18, token.endOffset() == 18); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a", - new String(token.termBuffer(), 0, token.termLength()).equals("a") == true); + assertTrue(token.term() + " is not equal to " + "a", + token.term().equals("a") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true); assertTrue(token.getFlags() + " equals: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG + " and it shouldn't", token.getFlags() != WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG); assertTrue(token.startOffset() + " does not equal: " + 11, token.startOffset() == 11); assertTrue(token.endOffset() + " does not equal: " + 12, token.endOffset() == 12); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "b", - new String(token.termBuffer(), 0, token.termLength()).equals("b") == true); + assertTrue(token.term() + " is not equal to " + "b", + token.term().equals("b") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true); assertTrue(token.startOffset() + " does not equal: " + 13, token.startOffset() == 13); assertTrue(token.endOffset() + " does not equal: " + 14, token.endOffset() == 14); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "c", - new String(token.termBuffer(), 0, token.termLength()).equals("c") == true); + assertTrue(token.term() + " is not equal to " + "c", + token.term().equals("c") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true); 
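// Illustrative sketch, not part of this patch: the assertions in these tokenizer tests all
// follow the same shape, so a test could fold the basic term checks into a helper that
// walks the stream with a single reusable Token. assertTermSequence is a hypothetical
// helper, not something this patch adds.
import java.io.IOException;
import junit.framework.Assert;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public class TokenAssertions {
  public static void assertTermSequence(TokenStream stream, String[] expectedTerms) throws IOException {
    final Token reusableToken = new Token();
    for (int i = 0; i < expectedTerms.length; i++) {
      Token token = stream.next(reusableToken);
      Assert.assertNotNull("token " + i + " is null and it shouldn't be", token);
      Assert.assertEquals(expectedTerms[i], token.term());
    }
    // The stream should be exhausted once all expected terms have been seen.
    Assert.assertNull("token is not null and it should be", stream.next(reusableToken));
  }
}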
assertTrue(token.startOffset() + " does not equal: " + 15, token.startOffset() == 15); assertTrue(token.endOffset() + " does not equal: " + 16, token.endOffset() == 16); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "d", - new String(token.termBuffer(), 0, token.termLength()).equals("d") == true); + assertTrue(token.term() + " is not equal to " + "d", + token.term().equals("d") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true); assertTrue(token.startOffset() + " does not equal: " + 17, token.startOffset() == 17); @@ -401,175 +401,175 @@ - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "e f g", - new String(token.termBuffer(), 0, token.termLength()).equals("e f g") == true); + assertTrue(token.term() + " is not equal to " + "e f g", + token.term().equals("e f g") == true); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true); assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG); assertTrue(token.startOffset() + " does not equal: " + 32, token.startOffset() == 32); assertTrue(token.endOffset() + " does not equal: " + 37, token.endOffset() == 37); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "e", - new String(token.termBuffer(), 0, token.termLength()).equals("e") == true); + assertTrue(token.term() + " is not equal to " + "e", + token.term().equals("e") == true); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0); assertTrue(token.startOffset() + " does not equal: " + 32, token.startOffset() == 32); assertTrue(token.endOffset() + " does not equal: " + 33, token.endOffset() == 33); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "f", - new String(token.termBuffer(), 0, token.termLength()).equals("f") == true); + assertTrue(token.term() + " is not equal to " + "f", + token.term().equals("f") == true); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); assertTrue(token.startOffset() + " does not equal: " + 34, token.startOffset() == 34); assertTrue(token.endOffset() + " does not equal: " + 35, token.endOffset() == 35); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is 
not equal to " + "g", - new String(token.termBuffer(), 0, token.termLength()).equals("g") == true); + assertTrue(token.term() + " is not equal to " + "g", + token.term().equals("g") == true); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); assertTrue(token.startOffset() + " does not equal: " + 36, token.startOffset() == 36); assertTrue(token.endOffset() + " does not equal: " + 37, token.endOffset() == 37); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link", - new String(token.termBuffer(), 0, token.termLength()).equals("link") == true); + assertTrue(token.term() + " is not equal to " + "link", + token.term().equals("link") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true); assertTrue(token.startOffset() + " does not equal: " + 42, token.startOffset() == 42); assertTrue(token.endOffset() + " does not equal: " + 46, token.endOffset() == 46); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here", - new String(token.termBuffer(), 0, token.termLength()).equals("here") == true); + assertTrue(token.term() + " is not equal to " + "here", + token.term().equals("here") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true); assertTrue(token.startOffset() + " does not equal: " + 47, token.startOffset() == 47); assertTrue(token.endOffset() + " does not equal: " + 51, token.endOffset() == 51); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link", - new String(token.termBuffer(), 0, token.termLength()).equals("link") == true); + assertTrue(token.term() + " is not equal to " + "link", + token.term().equals("link") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); assertTrue(token.startOffset() + " does not equal: " + 56, token.startOffset() == 56); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true); assertTrue(token.endOffset() + " does not equal: " + 60, token.endOffset() == 60); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "there", - new String(token.termBuffer(), 0, token.termLength()).equals("there") == true); + assertTrue(token.term() + " is not equal to " + "there", + token.term().equals("there") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); 
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true); assertTrue(token.startOffset() + " does not equal: " + 61, token.startOffset() == 61); assertTrue(token.endOffset() + " does not equal: " + 66, token.endOffset() == 66); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics here", - new String(token.termBuffer(), 0, token.termLength()).equals("italics here") == true); + assertTrue(token.term() + " is not equal to " + "italics here", + token.term().equals("italics here") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true); assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG); assertTrue(token.startOffset() + " does not equal: " + 71, token.startOffset() == 71); assertTrue(token.endOffset() + " does not equal: " + 83, token.endOffset() == 83); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics", - new String(token.termBuffer(), 0, token.termLength()).equals("italics") == true); + assertTrue(token.term() + " is not equal to " + "italics", + token.term().equals("italics") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true); assertTrue(token.startOffset() + " does not equal: " + 71, token.startOffset() == 71); assertTrue(token.endOffset() + " does not equal: " + 78, token.endOffset() == 78); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here", - new String(token.termBuffer(), 0, token.termLength()).equals("here") == true); + assertTrue(token.term() + " is not equal to " + "here", + token.term().equals("here") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true); assertTrue(token.startOffset() + " does not equal: " + 79, token.startOffset() == 79); assertTrue(token.endOffset() + " does not equal: " + 83, token.endOffset() == 83); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "something", - new String(token.termBuffer(), 0, token.termLength()).equals("something") == true); + assertTrue(token.term() + " is not equal to " + "something", + token.term().equals("something") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); assertTrue(token.startOffset() + " does not equal: " + 86, token.startOffset() == 86); 
assertTrue(token.endOffset() + " does not equal: " + 95, token.endOffset() == 95); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "more italics", - new String(token.termBuffer(), 0, token.termLength()).equals("more italics") == true); + assertTrue(token.term() + " is not equal to " + "more italics", + token.term().equals("more italics") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true); assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG); assertTrue(token.startOffset() + " does not equal: " + 98, token.startOffset() == 98); assertTrue(token.endOffset() + " does not equal: " + 110, token.endOffset() == 110); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "more", - new String(token.termBuffer(), 0, token.termLength()).equals("more") == true); + assertTrue(token.term() + " is not equal to " + "more", + token.term().equals("more") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true); assertTrue(token.startOffset() + " does not equal: " + 98, token.startOffset() == 98); assertTrue(token.endOffset() + " does not equal: " + 102, token.endOffset() == 102); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics", - new String(token.termBuffer(), 0, token.termLength()).equals("italics") == true); + assertTrue(token.term() + " is not equal to " + "italics", + token.term().equals("italics") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true); assertTrue(token.startOffset() + " does not equal: " + 103, token.startOffset() == 103); assertTrue(token.endOffset() + " does not equal: " + 110, token.endOffset() == 110); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "h i j", - new String(token.termBuffer(), 0, token.termLength()).equals("h i j") == true); + assertTrue(token.term() + " is not equal to " + "h i j", + token.term().equals("h i j") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true); assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG); assertTrue(token.startOffset() + " does not 
equal: " + 124, token.startOffset() == 124); assertTrue(token.endOffset() + " does not equal: " + 133, token.endOffset() == 133); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "h", - new String(token.termBuffer(), 0, token.termLength()).equals("h") == true); + assertTrue(token.term() + " is not equal to " + "h", + token.term().equals("h") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true); assertTrue(token.startOffset() + " does not equal: " + 124, token.startOffset() == 124); assertTrue(token.endOffset() + " does not equal: " + 125, token.endOffset() == 125); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "i", - new String(token.termBuffer(), 0, token.termLength()).equals("i") == true); + assertTrue(token.term() + " is not equal to " + "i", + token.term().equals("i") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true); assertTrue(token.startOffset() + " does not equal: " + 128, token.startOffset() == 128); assertTrue(token.endOffset() + " does not equal: " + 129, token.endOffset() == 129); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is null and it shouldn't be", token != null); - assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "j", - new String(token.termBuffer(), 0, token.termLength()).equals("j") == true); + assertTrue(token.term() + " is not equal to " + "j", + token.term().equals("j") == true); assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1); assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true); assertTrue(token.startOffset() + " does not equal: " + 132, token.startOffset() == 132); assertTrue(token.endOffset() + " does not equal: " + 133, token.endOffset() == 133); - token = tf.next(); + token = tf.next(reusableToken); assertTrue("token is not null and it should be", token == null); } Index: contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java =================================================================== --- contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java (revision 684150) +++ contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java (working copy) @@ -134,6 +134,7 @@ * @see org.apache.lucene.analysis.TokenStream#next() */ public Token next(Token result) throws IOException { + assert result != null; if (tokens != null && tokens.hasNext()){ return (Token)tokens.next(); } Index: contrib/memory/src/test/org/apache/lucene/index/memory/PatternAnalyzerTest.java =================================================================== --- contrib/memory/src/test/org/apache/lucene/index/memory/PatternAnalyzerTest.java (revision 684150) +++ 
contrib/memory/src/test/org/apache/lucene/index/memory/PatternAnalyzerTest.java (working copy) @@ -197,9 +197,9 @@ private List getTokens(TokenStream stream) throws IOException { ArrayList tokens = new ArrayList(); - Token token; - while ((token = stream.next()) != null) { - tokens.add(token); + final Token reusableToken = new Token(); + for (Token token = stream.next(reusableToken); token != null; token = stream.next(reusableToken)) { + tokens.add(token.clone()); } return tokens; } @@ -211,7 +211,7 @@ for (; i < size; i++) { Token t1 = (Token) tokens1.get(i); Token t2 = (Token) tokens2.get(i); - if (!(t1.termText().equals(t2.termText()))) throw new IllegalStateException("termText"); + if (!(t1.term().equals(t2.term()))) throw new IllegalStateException("termText"); if (t1.startOffset() != t2.startOffset()) throw new IllegalStateException("startOffset"); if (t1.endOffset() != t2.endOffset()) throw new IllegalStateException("endOffset"); if (!(t1.type().equals(t2.type()))) throw new IllegalStateException("type"); @@ -222,8 +222,8 @@ catch (IllegalStateException e) { if (size > 0) { System.out.println("i=" + i + ", size=" + size); - System.out.println("t1[size]='" + ((Token) tokens1.get(size-1)).termText() + "'"); - System.out.println("t2[size]='" + ((Token) tokens2.get(size-1)).termText() + "'"); + System.out.println("t1[size]='" + ((Token) tokens1.get(size-1)).term() + "'"); + System.out.println("t2[size]='" + ((Token) tokens2.get(size-1)).term() + "'"); } throw e; } @@ -234,7 +234,7 @@ String str = "["; for (int i=0; i < tokens.size(); i++) { Token t1 = (Token) tokens.get(i); - str = str + "'" + t1.termText() + "', "; + str = str + "'" + t1.term() + "', "; } return str + "]"; } Index: contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java =================================================================== --- contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (revision 684150) +++ contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (working copy) @@ -275,7 +275,8 @@ return new TokenStream() { private Iterator iter = keywords.iterator(); private int start = 0; - public Token next() { + public Token next(Token token) { + assert token != null; if (!iter.hasNext()) return null; Object obj = iter.next(); @@ -283,7 +284,7 @@ throw new IllegalArgumentException("keyword must not be null"); String term = obj.toString(); - Token token = new Token(term, start, start + term.length()); + token.reinit(term, start, start + term.length()); start += term.length() + 1; // separate words by 1 (blank) character return token; } @@ -349,10 +350,9 @@ HashMap terms = new HashMap(); int numTokens = 0; int pos = -1; - Token token; - - while ((token = stream.next()) != null) { - String term = token.termText(); + final Token reusableToken = new Token(); + for (Token token = stream.next(reusableToken); token != null; token = stream.next(reusableToken)) { + String term = token.term(); if (term.length() == 0) continue; // nothing to do // if (DEBUG) System.err.println("token='" + term + "'"); numTokens++; Index: contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java =================================================================== --- contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java (revision 684150) +++ contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java (working copy) @@ -73,8 +73,9 @@ return new TokenFilter(child.tokenStream(fieldName, reader)) { private int position = -1; - public Token 
next() throws IOException { - Token token = input.next(); // from filter super class + public Token next(Token token) throws IOException { + assert token != null; + token = input.next(token); // from filter super class log.println(toString(token)); return token; } @@ -84,7 +85,7 @@ position += token.getPositionIncrement(); return "[" + logName + ":" + position + ":" + fieldName + ":" - + token.termText() + ":" + token.startOffset() + + token.term() + ":" + token.startOffset() + "-" + token.endOffset() + ":" + token.type() + "]"; } @@ -121,8 +122,9 @@ return new TokenFilter(child.tokenStream(fieldName, reader)) { private int todo = maxTokens; - public Token next() throws IOException { - return --todo >= 0 ? input.next() : null; + public Token next(Token token) throws IOException { + assert token != null; + return --todo >= 0 ? input.next(token) : null; } }; } @@ -239,9 +241,10 @@ final ArrayList tokens2 = new ArrayList(); TokenStream tokenStream = new TokenFilter(child.tokenStream(fieldName, reader)) { - public Token next() throws IOException { - Token token = input.next(); // from filter super class - if (token != null) tokens2.add(token); + public Token next(Token token) throws IOException { + assert token != null; + token = input.next(token); // from filter super class + if (token != null) tokens2.add(token.clone()); return token; } }; @@ -253,7 +256,8 @@ private Iterator iter = tokens.iterator(); - public Token next() { + public Token next(Token token) { + assert token != null; if (!iter.hasNext()) return null; return (Token) iter.next(); } @@ -300,12 +304,12 @@ HashMap map = new HashMap(); TokenStream stream = analyzer.tokenStream("", new StringReader(text)); try { - Token token; - while ((token = stream.next()) != null) { - MutableInteger freq = (MutableInteger) map.get(token.termText()); + final Token reusableToken = new Token(); + for (Token token = stream.next(reusableToken); token != null; token = stream.next(reusableToken)) { + MutableInteger freq = (MutableInteger) map.get(token.term()); if (freq == null) { freq = new MutableInteger(1); - map.put(token.termText(), freq); + map.put(token.term(), freq); } else { freq.setValue(freq.intValue() + 1); } Index: contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java =================================================================== --- contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java (revision 684150) +++ contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java (working copy) @@ -334,7 +334,8 @@ this.toLowerCase = toLowerCase; } - public Token next() { + public Token next(Token token) { + assert token != null; if (matcher == null) return null; while (true) { // loop takes care of leading and trailing boundary cases @@ -352,7 +353,7 @@ if (start != end) { // non-empty match (header/trailer) String text = str.substring(start, end); if (toLowerCase) text = text.toLowerCase(locale); - return new Token(text, start, end); + return token.reinit(text, start, end); } if (!isMatch) return null; } @@ -384,7 +385,8 @@ this.stopWords = stopWords; } - public Token next() { + public Token next(Token token) { + assert token != null; // cache loop instance vars (performance) String s = str; int len = s.length(); @@ -422,7 +424,11 @@ } while (text != null && isStopWord(text)); pos = i; - return text != null ? 
new Token(text, start, i) : null; + if (text == null) + { + return null; + } + return token.reinit(text, start, i); } private boolean isTokenChar(char c, boolean isLetter) { Index: contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java =================================================================== --- contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java (revision 684150) +++ contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java (working copy) @@ -68,23 +68,23 @@ } /** Returns the next token in the stream, or null at EOS. */ - public Token next() throws IOException { - Token token; + public Token next(Token token) throws IOException { + assert token != null; while (todo > 0 && index < stack.length) { // pop from stack - token = createToken(stack[index++], current); + token = createToken(stack[index++], current, token); if (token != null) { todo--; return token; } } - token = input.next(); + token = input.next(token); if (token == null) return null; // EOS; iterator exhausted - stack = synonyms.getSynonyms(token.termText()); // push onto stack + stack = synonyms.getSynonyms(token.term()); // push onto stack if (stack.length > maxSynonyms) randomize(stack); index = 0; - current = token; + current = (Token) token.clone(); todo = maxSynonyms; return token; } @@ -100,11 +100,15 @@ * @return a new token, or null to indicate that the given synonym should be * ignored */ - protected Token createToken(String synonym, Token current) { - Token token = new Token( - synonym, current.startOffset(), current.endOffset(), SYNONYM_TOKEN_TYPE); - token.setPositionIncrement(0); - return token; + protected Token createToken(String synonym, Token current, Token result) { + result.setTermBuffer(synonym); + result.setStartOffset(current.startOffset()); + result.setEndOffset(current.endOffset()); + result.setType(SYNONYM_TOKEN_TYPE); + result.setPositionIncrement(0); + result.setPayload(current.getPayload()); + result.setFlags(current.getFlags()); + return result; } /** Index: contrib/lucli/src/java/lucli/LuceneMethods.java =================================================================== --- contrib/lucli/src/java/lucli/LuceneMethods.java (revision 684150) +++ contrib/lucli/src/java/lucli/LuceneMethods.java (working copy) @@ -279,6 +279,7 @@ Analyzer analyzer = new StandardAnalyzer(); Enumeration fields = doc.fields(); + final Token reusableToken = new Token(); while (fields.hasMoreElements()) { Field field = (Field) fields.nextElement(); String fieldName = field.name(); @@ -299,10 +300,10 @@ // Tokenize field and add to postingTable TokenStream stream = analyzer.tokenStream(fieldName, reader); try { - for (Token t = stream.next(); t != null; t = stream.next()) { - position += (t.getPositionIncrement() - 1); + for (Token token = stream.next(reusableToken); token != null; token = stream.next(reusableToken)) { + position += (token.getPositionIncrement() - 1); position++; - String name = t.termText(); + String name = token.term(); Integer Count = (Integer) tokenHash.get(name); if (Count == null) { // not in there yet tokenHash.put(name, new Integer(1)); //first one Index: contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java =================================================================== --- contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java (revision 684150) +++ contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java (working copy) @@ -33,12 
+33,11 @@ { String s = "a天b"; ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s)); - Token token; int correctStartOffset = 0; int correctEndOffset = 1; - while ((token = tokenizer.next()) != null) - { + final Token reusableToken = new Token(); + for (Token token = tokenizer.next(reusableToken); token != null; token = tokenizer.next(reusableToken)) { assertEquals(correctStartOffset, token.startOffset()); assertEquals(correctEndOffset, token.endOffset()); correctStartOffset++; Index: contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java =================================================================== --- contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java (revision 684150) +++ contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java (working copy) @@ -42,12 +42,14 @@ */ private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception { TokenStream ts = a.tokenStream("dummy", new StringReader(input)); + final Token reusableToken = new Token(); + Token token; for (int i=0; i tokens; - // test a plain old token stream with synonyms tranlated to rows. + // test a plain old token stream with synonyms translated to rows. tokens = new LinkedList(); - tokens.add(new Token("please", 0, 6)); - tokens.add(new Token("divide", 7, 13)); - tokens.add(new Token("this", 14, 18)); - tokens.add(new Token("sentence", 19, 27)); - tokens.add(new Token("into", 28, 32)); - tokens.add(new Token("shingles", 33, 39)); + tokens.add(createToken("please", 0, 6)); + tokens.add(createToken("divide", 7, 13)); + tokens.add(createToken("this", 14, 18)); + tokens.add(createToken("sentence", 19, 27)); + tokens.add(createToken("into", 28, 32)); + tokens.add(createToken("shingles", 33, 39)); tls = new TokenListStream(tokens); @@ -70,21 +64,23 @@ ts = new ShingleMatrixFilter(tls, 1, 2, ' ', false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec()); - assertNext(ts, "please", 0, 6); - assertNext(ts, "please divide", 0, 13); - assertNext(ts, "divide", 7, 13); - assertNext(ts, "divide this", 7, 18); - assertNext(ts, "this", 14, 18); - assertNext(ts, "this sentence", 14, 27); - assertNext(ts, "sentence", 19, 27); - assertNext(ts, "sentence into", 19, 32); - assertNext(ts, "into", 28, 32); - assertNext(ts, "into shingles", 28, 39); - assertNext(ts, "shingles", 33, 39); + Token token = new Token(); + assertNext(ts, token, "please", 0, 6); + assertNext(ts, token, "please divide", 0, 13); + assertNext(ts, token, "divide", 7, 13); + assertNext(ts, token, "divide this", 7, 18); + assertNext(ts, token, "this", 14, 18); + assertNext(ts, token, "this sentence", 14, 27); + assertNext(ts, token, "sentence", 19, 27); + assertNext(ts, token, "sentence into", 19, 32); + assertNext(ts, token, "into", 28, 32); + assertNext(ts, token, "into shingles", 28, 39); + assertNext(ts, token, "shingles", 33, 39); - assertNull(ts.next()); + assertNull(ts.next(token)); + } /** @@ -95,9 +91,6 @@ ShingleMatrixFilter.defaultSettingsCodec = null;//new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec(); - Token token = new Token(); // for debug use only - - TokenStream ts; TokenListStream tls; LinkedList tokens; @@ -117,25 +110,27 @@ ts = new ShingleMatrixFilter(tls, 2, 2, '_', false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec()); - assertNext(ts, "hello_world"); - assertNext(ts, "greetings_world"); - assertNext(ts, "hello_earth"); - assertNext(ts, "greetings_earth"); - 
assertNext(ts, "hello_tellus"); - assertNext(ts, "greetings_tellus"); - assertNull(ts.next()); + Token token = new Token(); + assertNext(ts, token, "hello_world"); + assertNext(ts, token, "greetings_world"); + assertNext(ts, token, "hello_earth"); + assertNext(ts, token, "greetings_earth"); + assertNext(ts, token, "hello_tellus"); + assertNext(ts, token, "greetings_tellus"); + assertNull(ts.next(token)); // bi-grams with no spacer character, start offset, end offset tls.reset(); ts = new ShingleMatrixFilter(tls, 2, 2, null, false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec()); - assertNext(ts, "helloworld", 0, 10); - assertNext(ts, "greetingsworld", 0, 10); - assertNext(ts, "helloearth", 0, 10); - assertNext(ts, "greetingsearth", 0, 10); - assertNext(ts, "hellotellus", 0, 10); - assertNext(ts, "greetingstellus", 0, 10); - assertNull(ts.next()); + token = new Token(); + assertNext(ts, token, "helloworld", 0, 10); + assertNext(ts, token, "greetingsworld", 0, 10); + assertNext(ts, token, "helloearth", 0, 10); + assertNext(ts, token, "greetingsearth", 0, 10); + assertNext(ts, token, "hellotellus", 0, 10); + assertNext(ts, token, "greetingstellus", 0, 10); + assertNull(ts.next(token)); // add ^_prefix_and_suffix_$ @@ -160,119 +155,122 @@ ts = new ShingleMatrixFilter(tls, 2, 2, '_', false); // -// while ((token = ts.next(token)) != null) { -// System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");"); +// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) { +// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? 
"1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");"); // token.clear(); // } - assertNext(ts, "^_hello", 1, 10.049875f, 0, 4); - assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4); - assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10); - assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10); - assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10); - assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10); - assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10); - assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10); - assertNext(ts, "world_$", 1, 7.1414285f, 5, 10); - assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10); - assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10); - assertNull(ts.next()); + token = new Token(); + assertNext(ts, token, "^_hello", 1, 10.049875f, 0, 4); + assertNext(ts, token, "^_greetings", 1, 10.049875f, 0, 4); + assertNext(ts, token, "hello_world", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "greetings_world", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "hello_earth", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "greetings_earth", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "hello_tellus", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "greetings_tellus", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "world_$", 1, 7.1414285f, 5, 10); + assertNext(ts, token, "earth_$", 1, 7.1414285f, 5, 10); + assertNext(ts, token, "tellus_$", 1, 7.1414285f, 5, 10); + assertNull(ts.next(token)); // test unlimited size and allow single boundary token as shingle tls.reset(); ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, '_', false); // -// while ((token = ts.next(token)) != null) { -// System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");"); +// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) { +// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? 
"1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");"); // token.clear(); // } - assertNext(ts, "^", 1, 10.0f, 0, 0); - assertNext(ts, "^_hello", 1, 10.049875f, 0, 4); - assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10); - assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10); - assertNext(ts, "hello", 1, 1.0f, 0, 4); - assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10); - assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10); - assertNext(ts, "world", 1, 1.0f, 5, 10); - assertNext(ts, "world_$", 1, 7.1414285f, 5, 10); - assertNext(ts, "$", 1, 7.071068f, 10, 10); - assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4); - assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10); - assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10); - assertNext(ts, "greetings", 1, 1.0f, 0, 4); - assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10); - assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10); - assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10); - assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10); - assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10); - assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10); - assertNext(ts, "earth", 1, 1.0f, 5, 10); - assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10); - assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10); - assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10); - assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10); - assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10); - assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10); - assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10); - assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10); - assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10); - assertNext(ts, "tellus", 1, 1.0f, 5, 10); - assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10); - assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10); - assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10); - assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10); - assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10); + token = new Token(); + assertNext(ts, token, "^", 1, 10.0f, 0, 0); + assertNext(ts, token, "^_hello", 1, 10.049875f, 0, 4); + assertNext(ts, token, "^_hello_world", 1, 10.099504f, 0, 10); + assertNext(ts, token, "^_hello_world_$", 1, 12.328828f, 0, 10); + assertNext(ts, token, "hello", 1, 1.0f, 0, 4); + assertNext(ts, token, "hello_world", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "hello_world_$", 1, 7.2111025f, 0, 10); + assertNext(ts, token, "world", 1, 1.0f, 5, 10); + assertNext(ts, token, "world_$", 1, 7.1414285f, 5, 10); + assertNext(ts, token, "$", 1, 7.071068f, 10, 10); + assertNext(ts, token, "^_greetings", 1, 10.049875f, 0, 4); + assertNext(ts, token, "^_greetings_world", 1, 10.099504f, 0, 10); + assertNext(ts, token, "^_greetings_world_$", 1, 12.328828f, 0, 10); + assertNext(ts, token, "greetings", 1, 1.0f, 0, 4); + assertNext(ts, token, "greetings_world", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "greetings_world_$", 1, 7.2111025f, 0, 10); + assertNext(ts, token, "^_hello_earth", 1, 10.099504f, 0, 10); + assertNext(ts, token, "^_hello_earth_$", 1, 12.328828f, 0, 10); + assertNext(ts, token, "hello_earth", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "hello_earth_$", 1, 7.2111025f, 0, 10); + assertNext(ts, token, "earth", 1, 1.0f, 5, 10); + assertNext(ts, token, "earth_$", 1, 7.1414285f, 5, 10); + assertNext(ts, token, "^_greetings_earth", 1, 10.099504f, 0, 10); + 
assertNext(ts, token, "^_greetings_earth_$", 1, 12.328828f, 0, 10); + assertNext(ts, token, "greetings_earth", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "greetings_earth_$", 1, 7.2111025f, 0, 10); + assertNext(ts, token, "^_hello_tellus", 1, 10.099504f, 0, 10); + assertNext(ts, token, "^_hello_tellus_$", 1, 12.328828f, 0, 10); + assertNext(ts, token, "hello_tellus", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "hello_tellus_$", 1, 7.2111025f, 0, 10); + assertNext(ts, token, "tellus", 1, 1.0f, 5, 10); + assertNext(ts, token, "tellus_$", 1, 7.1414285f, 5, 10); + assertNext(ts, token, "^_greetings_tellus", 1, 10.099504f, 0, 10); + assertNext(ts, token, "^_greetings_tellus_$", 1, 12.328828f, 0, 10); + assertNext(ts, token, "greetings_tellus", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "greetings_tellus_$", 1, 7.2111025f, 0, 10); - assertNull(ts.next()); + assertNull(ts.next(token)); // test unlimited size but don't allow single boundary token as shingle tls.reset(); ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, '_', true); -// while ((token = ts.next(token)) != null) { -// System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");"); +// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) { +// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");"); // token.clear(); // } - assertNext(ts, "^_hello", 1, 10.049875f, 0, 4); - assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10); - assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10); - assertNext(ts, "hello", 1, 1.0f, 0, 4); - assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10); - assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10); - assertNext(ts, "world", 1, 1.0f, 5, 10); - assertNext(ts, "world_$", 1, 7.1414285f, 5, 10); - assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4); - assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10); - assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10); - assertNext(ts, "greetings", 1, 1.0f, 0, 4); - assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10); - assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10); - assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10); - assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10); - assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10); - assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10); - assertNext(ts, "earth", 1, 1.0f, 5, 10); - assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10); - assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10); - assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10); - assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10); - assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10); - assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10); - assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10); - assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10); - assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10); - assertNext(ts, "tellus", 1, 1.0f, 5, 10); - assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10); - assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10); - assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10); - assertNext(ts, 
"greetings_tellus", 1, 1.4142135f, 0, 10); - assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10); + token = new Token(); + assertNext(ts, token, "^_hello", 1, 10.049875f, 0, 4); + assertNext(ts, token, "^_hello_world", 1, 10.099504f, 0, 10); + assertNext(ts, token, "^_hello_world_$", 1, 12.328828f, 0, 10); + assertNext(ts, token, "hello", 1, 1.0f, 0, 4); + assertNext(ts, token, "hello_world", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "hello_world_$", 1, 7.2111025f, 0, 10); + assertNext(ts, token, "world", 1, 1.0f, 5, 10); + assertNext(ts, token, "world_$", 1, 7.1414285f, 5, 10); + assertNext(ts, token, "^_greetings", 1, 10.049875f, 0, 4); + assertNext(ts, token, "^_greetings_world", 1, 10.099504f, 0, 10); + assertNext(ts, token, "^_greetings_world_$", 1, 12.328828f, 0, 10); + assertNext(ts, token, "greetings", 1, 1.0f, 0, 4); + assertNext(ts, token, "greetings_world", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "greetings_world_$", 1, 7.2111025f, 0, 10); + assertNext(ts, token, "^_hello_earth", 1, 10.099504f, 0, 10); + assertNext(ts, token, "^_hello_earth_$", 1, 12.328828f, 0, 10); + assertNext(ts, token, "hello_earth", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "hello_earth_$", 1, 7.2111025f, 0, 10); + assertNext(ts, token, "earth", 1, 1.0f, 5, 10); + assertNext(ts, token, "earth_$", 1, 7.1414285f, 5, 10); + assertNext(ts, token, "^_greetings_earth", 1, 10.099504f, 0, 10); + assertNext(ts, token, "^_greetings_earth_$", 1, 12.328828f, 0, 10); + assertNext(ts, token, "greetings_earth", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "greetings_earth_$", 1, 7.2111025f, 0, 10); + assertNext(ts, token, "^_hello_tellus", 1, 10.099504f, 0, 10); + assertNext(ts, token, "^_hello_tellus_$", 1, 12.328828f, 0, 10); + assertNext(ts, token, "hello_tellus", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "hello_tellus_$", 1, 7.2111025f, 0, 10); + assertNext(ts, token, "tellus", 1, 1.0f, 5, 10); + assertNext(ts, token, "tellus_$", 1, 7.1414285f, 5, 10); + assertNext(ts, token, "^_greetings_tellus", 1, 10.099504f, 0, 10); + assertNext(ts, token, "^_greetings_tellus_$", 1, 12.328828f, 0, 10); + assertNext(ts, token, "greetings_tellus", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "greetings_tellus_$", 1, 7.2111025f, 0, 10); - assertNull(ts.next()); + assertNull(ts.next(token)); System.currentTimeMillis(); @@ -300,27 +298,28 @@ ts = new ShingleMatrixFilter(tls, 2, 3, '_', false); -// while ((token = ts.next(token)) != null) { -// System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");"); +// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) { +// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? 
"1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");"); // token.clear(); // } // shingle, position increment, weight, start offset, end offset - assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10); - assertNext(ts, "greetings_and", 1, 1.4142135f, 0, 4); - assertNext(ts, "greetings_and_salutations", 1, 1.7320508f, 0, 4); - assertNext(ts, "and_salutations", 1, 1.4142135f, 0, 4); - assertNext(ts, "and_salutations_world", 1, 1.7320508f, 0, 10); - assertNext(ts, "salutations_world", 1, 1.4142135f, 0, 10); - assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10); - assertNext(ts, "and_salutations_earth", 1, 1.7320508f, 0, 10); - assertNext(ts, "salutations_earth", 1, 1.4142135f, 0, 10); - assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10); - assertNext(ts, "and_salutations_tellus", 1, 1.7320508f, 0, 10); - assertNext(ts, "salutations_tellus", 1, 1.4142135f, 0, 10); + token = new Token(); + assertNext(ts, token, "hello_world", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "greetings_and", 1, 1.4142135f, 0, 4); + assertNext(ts, token, "greetings_and_salutations", 1, 1.7320508f, 0, 4); + assertNext(ts, token, "and_salutations", 1, 1.4142135f, 0, 4); + assertNext(ts, token, "and_salutations_world", 1, 1.7320508f, 0, 10); + assertNext(ts, token, "salutations_world", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "hello_earth", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "and_salutations_earth", 1, 1.7320508f, 0, 10); + assertNext(ts, token, "salutations_earth", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "hello_tellus", 1, 1.4142135f, 0, 10); + assertNext(ts, token, "and_salutations_tellus", 1, 1.7320508f, 0, 10); + assertNext(ts, token, "salutations_tellus", 1, 1.4142135f, 0, 10); - assertNull(ts.next()); + assertNull(ts.next(token)); System.currentTimeMillis(); @@ -361,53 +360,53 @@ TokenStream ts = new ShingleMatrixFilter(matrix, 2, 4, '_', true, new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec()); -// Token token = new Token(); -// while ((token = ts.next(token)) != null) { -// System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");"); +// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) { +// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? 
"1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");"); // token.clear(); // } - assertNext(ts, "no_surprise", 1, 1.4142135f, 0, 0); - assertNext(ts, "no_surprise_to", 1, 1.7320508f, 0, 0); - assertNext(ts, "no_surprise_to_see", 1, 2.0f, 0, 0); - assertNext(ts, "surprise_to", 1, 1.4142135f, 0, 0); - assertNext(ts, "surprise_to_see", 1, 1.7320508f, 0, 0); - assertNext(ts, "surprise_to_see_england", 1, 2.0f, 0, 0); - assertNext(ts, "to_see", 1, 1.4142135f, 0, 0); - assertNext(ts, "to_see_england", 1, 1.7320508f, 0, 0); - assertNext(ts, "to_see_england_manager", 1, 2.0f, 0, 0); - assertNext(ts, "see_england", 1, 1.4142135f, 0, 0); - assertNext(ts, "see_england_manager", 1, 1.7320508f, 0, 0); - assertNext(ts, "see_england_manager_svennis", 1, 2.0f, 0, 0); - assertNext(ts, "england_manager", 1, 1.4142135f, 0, 0); - assertNext(ts, "england_manager_svennis", 1, 1.7320508f, 0, 0); - assertNext(ts, "england_manager_svennis_in", 1, 2.0f, 0, 0); - assertNext(ts, "manager_svennis", 1, 1.4142135f, 0, 0); - assertNext(ts, "manager_svennis_in", 1, 1.7320508f, 0, 0); - assertNext(ts, "manager_svennis_in_the", 1, 2.0f, 0, 0); - assertNext(ts, "svennis_in", 1, 1.4142135f, 0, 0); - assertNext(ts, "svennis_in_the", 1, 1.7320508f, 0, 0); - assertNext(ts, "svennis_in_the_croud", 1, 2.0f, 0, 0); - assertNext(ts, "in_the", 1, 1.4142135f, 0, 0); - assertNext(ts, "in_the_croud", 1, 1.7320508f, 0, 0); - assertNext(ts, "the_croud", 1, 1.4142135f, 0, 0); - assertNext(ts, "see_england_manager_sven", 1, 2.0f, 0, 0); - assertNext(ts, "england_manager_sven", 1, 1.7320508f, 0, 0); - assertNext(ts, "england_manager_sven_göran", 1, 2.0f, 0, 0); - assertNext(ts, "manager_sven", 1, 1.4142135f, 0, 0); - assertNext(ts, "manager_sven_göran", 1, 1.7320508f, 0, 0); - assertNext(ts, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0); - assertNext(ts, "sven_göran", 1, 1.4142135f, 0, 0); - assertNext(ts, "sven_göran_eriksson", 1, 1.7320508f, 0, 0); - assertNext(ts, "sven_göran_eriksson_in", 1, 2.0f, 0, 0); - assertNext(ts, "göran_eriksson", 1, 1.4142135f, 0, 0); - assertNext(ts, "göran_eriksson_in", 1, 1.7320508f, 0, 0); - assertNext(ts, "göran_eriksson_in_the", 1, 2.0f, 0, 0); - assertNext(ts, "eriksson_in", 1, 1.4142135f, 0, 0); - assertNext(ts, "eriksson_in_the", 1, 1.7320508f, 0, 0); - assertNext(ts, "eriksson_in_the_croud", 1, 2.0f, 0, 0); + Token token = new Token(); + assertNext(ts, token, "no_surprise", 1, 1.4142135f, 0, 0); + assertNext(ts, token, "no_surprise_to", 1, 1.7320508f, 0, 0); + assertNext(ts, token, "no_surprise_to_see", 1, 2.0f, 0, 0); + assertNext(ts, token, "surprise_to", 1, 1.4142135f, 0, 0); + assertNext(ts, token, "surprise_to_see", 1, 1.7320508f, 0, 0); + assertNext(ts, token, "surprise_to_see_england", 1, 2.0f, 0, 0); + assertNext(ts, token, "to_see", 1, 1.4142135f, 0, 0); + assertNext(ts, token, "to_see_england", 1, 1.7320508f, 0, 0); + assertNext(ts, token, "to_see_england_manager", 1, 2.0f, 0, 0); + assertNext(ts, token, "see_england", 1, 1.4142135f, 0, 0); + assertNext(ts, token, "see_england_manager", 1, 1.7320508f, 0, 0); + assertNext(ts, token, "see_england_manager_svennis", 1, 2.0f, 0, 0); + assertNext(ts, token, "england_manager", 1, 1.4142135f, 0, 0); + assertNext(ts, token, "england_manager_svennis", 1, 1.7320508f, 0, 0); + assertNext(ts, token, "england_manager_svennis_in", 1, 2.0f, 0, 0); + assertNext(ts, token, "manager_svennis", 1, 1.4142135f, 0, 0); + assertNext(ts, token, "manager_svennis_in", 1, 1.7320508f, 0, 0); + 
assertNext(ts, token, "manager_svennis_in_the", 1, 2.0f, 0, 0); + assertNext(ts, token, "svennis_in", 1, 1.4142135f, 0, 0); + assertNext(ts, token, "svennis_in_the", 1, 1.7320508f, 0, 0); + assertNext(ts, token, "svennis_in_the_croud", 1, 2.0f, 0, 0); + assertNext(ts, token, "in_the", 1, 1.4142135f, 0, 0); + assertNext(ts, token, "in_the_croud", 1, 1.7320508f, 0, 0); + assertNext(ts, token, "the_croud", 1, 1.4142135f, 0, 0); + assertNext(ts, token, "see_england_manager_sven", 1, 2.0f, 0, 0); + assertNext(ts, token, "england_manager_sven", 1, 1.7320508f, 0, 0); + assertNext(ts, token, "england_manager_sven_göran", 1, 2.0f, 0, 0); + assertNext(ts, token, "manager_sven", 1, 1.4142135f, 0, 0); + assertNext(ts, token, "manager_sven_göran", 1, 1.7320508f, 0, 0); + assertNext(ts, token, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0); + assertNext(ts, token, "sven_göran", 1, 1.4142135f, 0, 0); + assertNext(ts, token, "sven_göran_eriksson", 1, 1.7320508f, 0, 0); + assertNext(ts, token, "sven_göran_eriksson_in", 1, 2.0f, 0, 0); + assertNext(ts, token, "göran_eriksson", 1, 1.4142135f, 0, 0); + assertNext(ts, token, "göran_eriksson_in", 1, 1.7320508f, 0, 0); + assertNext(ts, token, "göran_eriksson_in_the", 1, 2.0f, 0, 0); + assertNext(ts, token, "eriksson_in", 1, 1.4142135f, 0, 0); + assertNext(ts, token, "eriksson_in_the", 1, 1.7320508f, 0, 0); + assertNext(ts, token, "eriksson_in_the_croud", 1, 2.0f, 0, 0); - assertNull(ts.next()); + assertNull(ts.next(token)); } @@ -417,11 +416,9 @@ private Token tokenFactory(String text, int posIncr, int startOffset, int endOffset) { - Token token = new Token(); - token.setTermText(text); + Token token = new Token(startOffset, endOffset); + token.setTermBuffer(text); token.setPositionIncrement(posIncr); - token.setStartOffset(startOffset); - token.setEndOffset(endOffset); return token; } @@ -435,48 +432,44 @@ } private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset) { - Token token = new Token(); - token.setTermText(text); + Token token = new Token(startOffset, endOffset); + token.setTermBuffer(text); token.setPositionIncrement(posIncr); ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight); - token.setStartOffset(startOffset); - token.setEndOffset(endOffset); return token; } private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset, ShingleMatrixFilter.TokenPositioner positioner) { - Token token = new Token(); - token.setTermText(text); + Token token = new Token(startOffset, endOffset); + token.setTermBuffer(text); token.setPositionIncrement(posIncr); ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight); - token.setStartOffset(startOffset); - token.setEndOffset(endOffset); ShingleMatrixFilter.defaultSettingsCodec.setTokenPositioner(token, positioner); return token; } // assert-methods start here - private Token assertNext(TokenStream ts, String text) throws IOException { - Token token = ts.next(new Token()); + private Token assertNext(TokenStream ts, Token token, String text) throws IOException { + ts.next(token); assertNotNull(token); - assertEquals(text, new String(token.termBuffer(), 0, token.termLength())); + assertEquals(text, token.term()); return token; } - private Token assertNext(TokenStream ts, String text, int positionIncrement, float boost) throws IOException { - Token token = ts.next(new Token()); + private Token assertNext(TokenStream ts, Token token, String text, int positionIncrement, float boost) throws IOException { + token = ts.next(new 
Token()); assertNotNull(token); - assertEquals(text, new String(token.termBuffer(), 0, token.termLength())); + assertEquals(text, token.term()); assertEquals(positionIncrement, token.getPositionIncrement()); assertEquals(boost, token.getPayload() == null ? 1f : PayloadHelper.decodeFloat(token.getPayload().getData())); return token; } - private Token assertNext(TokenStream ts, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException { - Token token = ts.next(new Token()); + private Token assertNext(TokenStream ts, Token token, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException { + token = ts.next(token); assertNotNull(token); - assertEquals(text, new String(token.termBuffer(), 0, token.termLength())); + assertEquals(text, token.term()); assertEquals(positionIncrement, token.getPositionIncrement()); assertEquals(boost, token.getPayload() == null ? 1f : PayloadHelper.decodeFloat(token.getPayload().getData())); assertEquals(startOffset, token.startOffset()); @@ -484,25 +477,32 @@ return token; } - private Token assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException { - Token token = ts.next(new Token()); + private Token assertNext(TokenStream ts, Token token, String text, int startOffset, int endOffset) throws IOException { + token = ts.next(token); assertNotNull(token); - assertEquals(text, new String(token.termBuffer(), 0, token.termLength())); + assertEquals(text, token.term()); assertEquals(startOffset, token.startOffset()); assertEquals(endOffset, token.endOffset()); return token; } + private static Token createToken(String term, int start, int offset) + { + Token token = new Token(start, offset); + token.setTermBuffer(term); + return token; + } + public static class TokenListStream extends TokenStream { private Collection tokens; public TokenListStream(TokenStream ts) throws IOException { tokens = new ArrayList(); - Token token; - while ((token = ts.next(new Token())) != null) { - tokens.add(token); + final Token reusableToken = new Token(); + for (Token token = ts.next(reusableToken); token != null; token = ts.next(reusableToken)) { + tokens.add((Token) token.clone()); } } @@ -512,14 +512,16 @@ private Iterator iterator; - public Token next() throws IOException { + public Token next(Token token) throws IOException { + assert token != null; if (iterator == null) { iterator = tokens.iterator(); } if (!iterator.hasNext()) { return null; } - return iterator.next(); + token = (Token) iterator.next(); + return (Token) token.clone(); } Index: contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java =================================================================== --- contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (revision 684150) +++ contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (working copy) @@ -36,13 +36,14 @@ throws Exception { TokenStream ts = a.tokenStream("dummy", new StringReader(input)); - + final Token reusableToken = new Token(); + Token token; for (int i = 0; i < output.length; i++) { - Token t = ts.next(); - assertNotNull(t); - assertEquals(t.termText(), output[i]); + token = ts.next(reusableToken); + assertNotNull(token); + assertEquals(token.term(), output[i]); } - assertNull(ts.next()); + assertNull(ts.next(reusableToken)); ts.close(); } Index: contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java 
=================================================================== --- contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java (revision 684150) +++ contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java (working copy) @@ -43,10 +43,10 @@ String test = "The quick red fox jumped over the lazy brown dogs"; NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))), 3, "D"); - Token tok = new Token(); boolean seenDogs = false; - while ((tok = nptf.next(tok)) != null){ - if (tok.termText().equals("dogs")){ + final Token reusableToken = new Token(); + for (Token tok = nptf.next(reusableToken); tok != null; tok = nptf.next(reusableToken)) { + if (tok.term().equals("dogs")){ seenDogs = true; assertTrue(tok.type() + " is not equal to " + "D", tok.type().equals("D") == true); assertTrue("tok.getPayload() is null and it shouldn't be", tok.getPayload() != null); @@ -68,8 +68,9 @@ } public Token next(Token result) throws IOException { + assert result != null; result = input.next(result); - if (result != null && result.termText().equals("dogs")) { + if (result != null && result.term().equals("dogs")) { result.setType("D"); } return result; Index: contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterTest.java =================================================================== --- contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterTest.java (revision 684150) +++ contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterTest.java (working copy) @@ -44,9 +44,9 @@ String test = "The quick red fox jumped over the lazy brown dogs"; TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test)))); - Token tok = new Token(); int count = 0; - while ((tok = nptf.next(tok)) != null){ + final Token reusableToken = new Token(); + for (Token tok = nptf.next(reusableToken); tok != null; tok = nptf.next(reusableToken)) { assertTrue(tok.type() + " is not null and it should be", tok.type().equals(String.valueOf(Character.toUpperCase(tok.termBuffer()[0])))); assertTrue("tok.getPayload() is null and it shouldn't be", tok.getPayload() != null); String type = new String(tok.getPayload().getData(), "UTF-8"); @@ -65,6 +65,7 @@ public Token next(Token result) throws IOException { + assert result != null; result = input.next(result); if (result != null) { result.setType(String.valueOf(Character.toUpperCase(result.termBuffer()[0]))); Index: contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterTest.java =================================================================== --- contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterTest.java (revision 684150) +++ contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterTest.java (working copy) @@ -42,9 +42,9 @@ String test = "The quick red fox jumped over the lazy brown dogs"; TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(new WhitespaceTokenizer(new StringReader(test))); - Token tok = new Token(); int count = 0; - while ((tok = nptf.next(tok)) != null){ + final Token reusableToken = new Token(); + for (Token tok = nptf.next(reusableToken); tok != null; tok = nptf.next(reusableToken)) { assertTrue("tok is null and it shouldn't be", 
tok != null); Payload pay = tok.getPayload(); assertTrue("pay is null and it shouldn't be", pay != null); Index: contrib/analyzers/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (working copy) @@ -105,12 +105,13 @@ return dict; } - public Token next() throws IOException { + public Token next(Token token) throws IOException { + assert token != null; if (tokens.size() > 0) { return (Token)tokens.removeFirst(); } - Token token = input.next(); + token = input.next(token); if (token == null) { return null; } @@ -145,17 +146,15 @@ protected final Token createToken(final int offset, final int length, final Token prototype) { - Token t = new Token(prototype.startOffset() + offset, prototype - .startOffset() - + offset + length, prototype.type()); - t.setTermBuffer(prototype.termBuffer(), offset, length); + int newStart = prototype.startOffset() + offset; + Token t = prototype.clone(prototype.termBuffer(), offset, length, newStart, newStart+length); t.setPositionIncrement(0); return t; } protected void decompose(final Token token) { // In any case we give the original token back - tokens.add(token); + tokens.add((Token) token.clone()); // Only words longer than minWordSize get processed if (token.termLength() < this.minWordSize) { Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java (working copy) @@ -37,25 +37,20 @@ this.charset = charset; } - public final Token next() throws java.io.IOException + public final Token next(Token token) throws java.io.IOException { - Token t = input.next(); + assert token != null; + token = input.next(token); - if (t == null) + if (token == null) return null; - String txt = t.termText(); - - char[] chArray = txt.toCharArray(); - for (int i = 0; i < chArray.length; i++) + char[] chArray = token.termBuffer(); + int chLen = token.termLength(); + for (int i = 0; i < chLen; i++) { chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset); } - - String newTxt = new String(chArray); - // create new token - Token newToken = new Token(newTxt, t.startOffset(), t.endOffset()); - - return newToken; + return token; } } Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java (working copy) @@ -35,7 +35,6 @@ /** * The actual token in the input stream. 
*/ - private Token token = null; private RussianStemmer stemmer = null; public RussianStemFilter(TokenStream in, char[] charset) @@ -47,22 +46,18 @@ /** * @return Returns the next token in the stream, or null at EOS */ - public final Token next() throws IOException + public final Token next(Token token) throws IOException { - if ((token = input.next()) == null) + assert token != null; + if ((token = input.next(token)) == null) { return null; } - else - { - String s = stemmer.stem(token.termText()); - if (!s.equals(token.termText())) - { - return new Token(s, token.startOffset(), token.endOffset(), - token.type()); - } - return token; - } + String term = token.term(); + String s = stemmer.stem(term); + if (s != null && !s.equals(term)) + token.setTermBuffer(s); + return token; } /** Index: contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizer.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizer.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizer.java (working copy) @@ -48,7 +48,7 @@ public void add(Token t) { //check to see if this is a Category if (t != null && typeToMatch.equals(t.type())){ - lst.add(t.clone()); + super.add(t); } } } Index: contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizer.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizer.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizer.java (working copy) @@ -73,10 +73,10 @@ //Check to see if this token is a date if (t != null) { try { - Date date = dateFormat.parse(new String(t.termBuffer(), 0, t.termLength()));//We don't care about the date, just that we can parse it as a date + Date date = dateFormat.parse(t.term());//We don't care about the date, just that we can parse it as a date if (date != null) { t.setType(DATE_TYPE); - lst.add(t.clone()); + super.add(t); } } catch (ParseException e) { Index: contrib/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java (working copy) @@ -37,7 +37,6 @@ /** * The actual token in the input stream. */ - private Token token = null; private GermanStemmer stemmer = null; private Set exclusionSet = null; @@ -48,7 +47,7 @@ } /** - * Builds a GermanStemFilter that uses an exclusiontable. + * Builds a GermanStemFilter that uses an exclusion table. */ public GermanStemFilter( TokenStream in, Set exclusionSet ) { @@ -59,25 +58,22 @@ /** * @return Returns the next token in the stream, or null at EOS */ - public final Token next() + public final Token next(Token token) throws IOException { - if ( ( token = input.next() ) == null ) { + assert token != null; + if ( ( token = input.next(token) ) == null ) { return null; } - // Check the exclusiontable - else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) { - return token; + String term = token.term(); + // Check the exclusion table. 
+ if (exclusionSet == null || !exclusionSet.contains(term)) { + String s = stemmer.stem(term); + // If not stemmed, don't waste the time adjusting the token. + if ((s != null) && !s.equals(term)) + token.setTermBuffer(s); } - else { - String s = stemmer.stem( token.termText() ); - // If not stemmed, dont waste the time creating a new token - if ( !s.equals( token.termText() ) ) { - return new Token( s, token.startOffset(), - token.endOffset(), token.type() ); - } - return token; - } + return token; } /** Index: contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (working copy) @@ -47,7 +47,7 @@ /** * filler token for when positionIncrement is more than 1 */ - public static final String FILLER_TOKEN = "_"; + public static final char[] FILLER_TOKEN = { '_' }; /** @@ -150,11 +150,12 @@ } /* (non-Javadoc) - * @see org.apache.lucene.analysis.TokenStream#next() - */ - public Token next() throws IOException { + * @see org.apache.lucene.analysis.TokenStream#next() + */ + public Token next(Token token) throws IOException { + assert token != null; if (outputBuf.isEmpty()) { - fillOutputBuf(); + fillOutputBuf(token); } Token nextToken = null; if ( ! outputBuf.isEmpty()) @@ -173,16 +174,19 @@ * @return the next token, or null if at end of input stream * @throws IOException if the input stream has a problem */ - private Token getNextToken() throws IOException { + private Token getNextToken(Token token) throws IOException { if (tokenBuf.isEmpty()) { - Token lastToken = input.next(); - if (lastToken != null) { - for (int i = 1; i < lastToken.getPositionIncrement(); i++) { - tokenBuf.add(new Token(FILLER_TOKEN, lastToken.startOffset(), - lastToken.startOffset())); + token = input.next(token); + if (token != null) { + for (int i = 1; i < token.getPositionIncrement(); i++) { + Token fillerToken = (Token) token.clone(); + // A filler token occupies no space + fillerToken.setEndOffset(fillerToken.startOffset()); + fillerToken.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length); + tokenBuf.add(fillerToken); } - tokenBuf.add(lastToken); - return getNextToken(); + tokenBuf.add(token.clone()); + return getNextToken(token); } else { return null; } @@ -196,15 +200,15 @@ * * @throws IOException if there's a problem getting the next token */ - private void fillOutputBuf() throws IOException { + private void fillOutputBuf(Token token) throws IOException { boolean addedToken = false; /* * Try to fill the shingle buffer. */ do { - Token token = getNextToken(); + token = getNextToken(token); if (token != null) { - shingleBuf.add(token); + shingleBuf.add(token.clone()); if (shingleBuf.size() > maxShingleSize) { shingleBuf.remove(0); @@ -235,17 +239,17 @@ } int i = 0; - Token token = null; + Token shingle = null; for (Iterator it = shingleBuf.iterator(); it.hasNext(); ) { - token = (Token) it.next(); + shingle = (Token) it.next(); for (int j = i; j < shingles.length; j++) { if (shingles[j].length() != 0) { shingles[j].append(TOKEN_SEPARATOR); } - shingles[j].append(token.termBuffer(), 0, token.termLength()); + shingles[j].append(shingle.termBuffer(), 0, shingle.termLength()); } - endOffsets[i] = token.endOffset(); + endOffsets[i] = shingle.endOffset(); i++; } @@ -258,17 +262,26 @@ /* * Push new tokens to the output buffer. 
*/ + if (!shingleBuf.isEmpty()) { + Token firstShingle = (Token) shingleBuf.get(0); + shingle = (Token) firstShingle.clone(); + shingle.setType(tokenType); + } for (int j = 1; j < shingleBuf.size(); j++) { - Token shingle = new Token(shingles[j].toString(), - ((Token) shingleBuf.get(0)).startOffset(), - endOffsets[j], - tokenType); + shingle.setEndOffset(endOffsets[j]); + StringBuffer buf = shingles[j]; + int termLength = buf.length(); + char[] termBuffer = shingle.termBuffer(); + if (termBuffer.length < termLength) + termBuffer = shingle.resizeTermBuffer(termLength); + buf.getChars(0, termLength, termBuffer, 0); + shingle.setTermLength(termLength); if ((! outputUnigrams) && j == 1) { shingle.setPositionIncrement(1); } else { shingle.setPositionIncrement(0); } - outputBuf.add(shingle); + outputBuf.add(shingle.clone()); } } } Index: contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java (working copy) @@ -17,16 +17,23 @@ * limitations under the License. */ +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.NoSuchElementException; +import java.util.Set; + import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream; import org.apache.lucene.analysis.payloads.PayloadHelper; import org.apache.lucene.index.Payload; -import java.io.IOException; -import java.util.*; - /** *

    A ShingleFilter constructs shingles (token n-grams) from a token stream. * In other words, it creates combinations of tokens as a single token. @@ -299,6 +306,7 @@ private Matrix matrix; public Token next(Token token) throws IOException { + assert token != null; if (matrix == null) { matrix = new Matrix(); // fill matrix with maximumShingleSize columns @@ -340,14 +348,14 @@ } // shingle token factory - StringBuilder sb = new StringBuilder(termLength + 10); // paranormal abillity to forsay the future. + StringBuilder sb = new StringBuilder(termLength + 10); // paranormal ability to foresee the future. for (Token shingleToken : shingle) { if (spacerCharacter != null && sb.length() > 0) { sb.append(spacerCharacter); } sb.append(shingleToken.termBuffer(), 0, shingleToken.termLength()); } - token.setTermText(sb.toString()); + token.setTermBuffer(sb.toString()); updateToken(token, shingle, currentPermutationTokensStartOffset, currentPermutationRows, currentPermuationTokens); return token; Index: contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java (working copy) @@ -18,8 +18,11 @@ */ import java.util.Hashtable; -import org.apache.lucene.analysis.*; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; + /** * Title: ChineseFilter * Description: Filter with a stop word table @@ -61,10 +64,11 @@ stopTable.put(STOP_WORDS[i], STOP_WORDS[i]); } - public final Token next() throws java.io.IOException { + public final Token next(Token token) throws java.io.IOException { + assert token != null; - for (Token token = input.next(); token != null; token = input.next()) { - String text = token.termText(); + for (token = input.next(token); token != null; token = input.next(token)) { + String text = token.term(); // why not key off token type here assuming ChineseTokenizer comes first? 
if (stopTable.get(text) == null) { Index: contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java (working copy) @@ -19,9 +19,11 @@ import java.io.Reader; -import org.apache.lucene.analysis.*; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.Tokenizer; + /** * Title: ChineseTokenizer * Description: Extract tokens from the Stream using Character.getType() @@ -75,17 +77,19 @@ } - private final Token flush() { + private final Token flush(Token token) { if (length>0) { - //System.out.println(new String(buffer, 0, length)); - return new Token(new String(buffer, 0, length), start, start+length); + //System.out.println(new String(buffer, 0, + //length)); + return token.reinit(buffer, 0, length, start, start+length); } else return null; } - public final Token next() throws java.io.IOException { + public final Token next(Token token) throws java.io.IOException { + assert token != null; length = 0; start = offset; @@ -101,7 +105,7 @@ bufferIndex = 0; } - if (dataLen == -1) return flush(); + if (dataLen == -1) return flush(token); else c = ioBuffer[bufferIndex++]; @@ -112,20 +116,20 @@ case Character.LOWERCASE_LETTER: case Character.UPPERCASE_LETTER: push(c); - if (length == MAX_WORD_LEN) return flush(); + if (length == MAX_WORD_LEN) return flush(token); break; case Character.OTHER_LETTER: if (length>0) { bufferIndex--; offset--; - return flush(); + return flush(token); } push(c); - return flush(); + return flush(token); default: - if (length>0) return flush(); + if (length>0) return flush(token); break; } } Index: contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java (working copy) @@ -35,25 +35,20 @@ this.charset = charset; } - public final Token next() throws java.io.IOException + public final Token next(Token token) throws java.io.IOException { - Token t = input.next(); + assert token != null; + token = input.next(token); - if (t == null) + if (token == null) return null; - String txt = t.termText(); - - char[] chArray = txt.toCharArray(); - for (int i = 0; i < chArray.length; i++) + char[] chArray = token.termBuffer(); + int chLen = token.termLength(); + for (int i = 0; i < chLen; i++) { chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset); } - - String newTxt = new String(chArray); - // create new token - Token newToken = new Token(newTxt, t.startOffset(), t.endOffset()); - - return newToken; + return token; } } Index: contrib/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java (working copy) @@ -36,7 +36,6 @@ /** * The actual token in the input stream. 
*/ - private Token token = null; private BrazilianStemmer stemmer = null; private Set exclusions = null; @@ -53,22 +52,23 @@ /** * @return Returns the next token in the stream, or null at EOS. */ - public final Token next() + public final Token next(Token token) throws IOException { - if ((token = input.next()) == null) { + assert token != null; + if ((token = input.next(token)) == null) { return null; } - // Check the exclusiontable. - else if (exclusions != null && exclusions.contains(token.termText())) { - return token; - } else { - String s = stemmer.stem(token.termText()); - // If not stemmed, dont waste the time creating a new token. - if ((s != null) && !s.equals(token.termText())) { - return new Token(s, token.startOffset(), token.endOffset(), token.type()); - } - return token; + + String term = token.term(); + + // Check the exclusion table. + if (exclusions == null || !exclusions.contains(term)) { + String s = stemmer.stem(term); + // If not stemmed, don't waste the time adjusting the token. + if ((s != null) && !s.equals(term)) + token.setTermBuffer(s); } + return token; } } Index: contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java (working copy) @@ -28,20 +28,22 @@ public class SingleTokenTokenStream extends TokenStream { private boolean exhausted = false; + // The token needs to be immutable, so work with clones! private Token token; public SingleTokenTokenStream(Token token) { - this.token = token; + this.token = (Token) token.clone(); } public Token next(Token result) throws IOException { + assert token != null; if (exhausted) { return null; } exhausted = true; - return token; + return (Token) token.clone(); } @@ -50,10 +52,10 @@ } public Token getToken() { - return token; + return (Token) token.clone(); } public void setToken(Token token) { - this.token = token; + this.token = (Token) token.clone(); } } Index: contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java (working copy) @@ -46,6 +46,7 @@ private boolean prefixExhausted; public Token next(Token result) throws IOException { + assert result != null; Token buf = result; @@ -124,7 +125,6 @@ if (source.termBuffer() != null) { setTermBuffer(source.termBuffer(), 0, source.termLength()); } else { - setTermText(null); setTermLength(0); } Index: contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java (working copy) @@ -27,18 +27,8 @@ */ public class EmptyTokenStream extends TokenStream { - public Token next() throws IOException { - return null; - } - public Token next(Token result) throws IOException { + assert result != null; return null; } - - public void reset() throws 
IOException { - } - - public void close() throws IOException { - } - } Index: contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java (working copy) @@ -56,6 +56,7 @@ public Token next(Token result) throws IOException { + assert result != null; return suffix.next(result); } Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (working copy) @@ -64,7 +64,8 @@ } /** Returns the next token in the stream, or null at EOS. */ - public final Token next() throws IOException { + public final Token next(Token token) throws IOException { + assert token != null; if (!started) { started = true; gramSize = minGram; @@ -82,9 +83,9 @@ if (pos+gramSize > inLen) return null; } - String gram = inStr.substring(pos, pos+gramSize); + int oldPos = pos; pos++; - return new Token(gram, oldPos, oldPos+gramSize); + return token.reinit(inStr, oldPos, gramSize, oldPos, oldPos+gramSize); } } Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java (working copy) @@ -115,15 +115,15 @@ } /** Returns the next token in the stream, or null at EOS. */ - public final Token next() throws IOException { + public final Token next(Token token) throws IOException { + assert token != null; if (ngrams.size() > 0) { return (Token) ngrams.removeFirst(); } - Token token = input.next(); - if (token == null) { + token = input.next(token); + if (token == null) return null; - } ngram(token); if (ngrams.size() > 0) @@ -133,12 +133,12 @@ } private void ngram(Token token) { - String inStr = token.termText(); - int inLen = inStr.length(); + int termLength = token.termLength(); + char[] termBuffer = token.termBuffer(); int gramSize = minGram; while (gramSize <= maxGram) { // if the remaining input is too short, we can't generate any n-grams - if (gramSize > inLen) { + if (gramSize > termLength) { return; } @@ -147,13 +147,13 @@ return; } - Token tok; - if (side == Side.FRONT) { - tok = new Token(inStr.substring(0, gramSize), 0, gramSize); - } - else { - tok = new Token(inStr.substring(inLen-gramSize), inLen-gramSize, inLen); - } + // grab gramSize chars from front or back + int start = side == Side.FRONT ? 
0 : termLength - gramSize; + int end = start + gramSize; + Token tok = (Token) token.clone(); + tok.setStartOffset(start); + tok.setEndOffset(end); + tok.setTermBuffer(termBuffer, start, gramSize); ngrams.add(tok); gramSize++; } Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (working copy) @@ -19,6 +19,7 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter.Side; import java.io.IOException; import java.io.Reader; @@ -113,13 +114,14 @@ } /** Returns the next token in the stream, or null at EOS. */ - public final Token next() throws IOException { + public final Token next(Token token) throws IOException { + assert token != null; // if we are just starting, read the whole input if (!started) { started = true; char[] chars = new char[1024]; input.read(chars); - inStr = new String(chars).trim(); // remove any trailing empty strings + inStr = new String(chars).trim(); // remove any leading or trailing spaces inLen = inStr.length(); gramSize = minGram; } @@ -134,15 +136,13 @@ return null; } - Token tok; - if (side == Side.FRONT) { - tok = new Token(inStr.substring(0, gramSize), 0, gramSize); - } - else { - tok = new Token(inStr.substring(inLen-gramSize), inLen-gramSize, inLen); - } - + // grab gramSize chars from front or back + int start = side == Side.FRONT ? 0 : inLen - gramSize; + int end = start + gramSize; + token.setTermBuffer(inStr, start, gramSize); + token.setStartOffset(start); + token.setEndOffset(end); gramSize++; - return tok; + return token; } } Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (working copy) @@ -63,12 +63,13 @@ } /** Returns the next token in the stream, or null at EOS. 
*/ - public final Token next() throws IOException { + public final Token next(Token token) throws IOException { + assert token != null; if (ngrams.size() > 0) { return (Token) ngrams.removeFirst(); } - Token token = input.next(); + token = input.next(token); if (token == null) { return null; } @@ -81,16 +82,13 @@ } private void ngram(Token token) { - String inStr = token.termText(); - int inLen = inStr.length(); + char[] termBuffer = token.termBuffer(); + int termLength = token.termLength(); int gramSize = minGram; while (gramSize <= maxGram) { int pos = 0; // reset to beginning of string - while (pos+gramSize <= inLen) { // while there is input - String gram = inStr.substring(pos, pos+gramSize); - Token tok = new Token(gram, pos, pos+gramSize); -// tok.setPositionIncrement(pos); - ngrams.add(tok); + while (pos+gramSize <= termLength) { // while there is input + ngrams.add(token.clone(termBuffer, pos, gramSize, pos, pos+gramSize)); pos++; } gramSize++; // increase n-gram size Index: contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (working copy) @@ -26,7 +26,7 @@ /** * CJKTokenizer was modified from StopTokenizer which does a decent job for * most European languages. It performs other token methods for double-byte - * Characters: the token will return at each two charactors with overlap match.
    + * Characters: the token will return at each two characters with overlap match.
    * Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it * also need filter filter zero length token ""
    * for Digit: digit, '+', '#' will token as letter
    @@ -96,24 +96,26 @@ * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html * for detail. * + * @param token a reusable token * @return Token * * @throws java.io.IOException - throw IOException when read error
    - * hanppened in the InputStream + * happened in the InputStream * */ - public final Token next() throws java.io.IOException { + public final Token next(Token token) throws java.io.IOException { /** how many character(s) has been stored in buffer */ + assert token != null; int length = 0; /** the position used to create Token */ int start = offset; while (true) { - /** current charactor */ + /** current character */ char c; - /** unicode block of current charactor for detail */ + /** unicode block of current character for detail */ Character.UnicodeBlock ub; offset++; @@ -198,7 +200,7 @@ } } } else { - // non-ASCII letter, eg."C1C2C3C4" + // non-ASCII letter, e.g."C1C2C3C4" if (Character.isLetter(c)) { if (length == 0) { start = offset - 1; @@ -236,8 +238,6 @@ } } - return new Token(new String(buffer, 0, length), start, start + length, - tokenType - ); + return token.reinit(buffer, 0, length, start, start+length, tokenType); } } Index: contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java (working copy) @@ -37,12 +37,11 @@ /** * The actual token in the input stream. */ - private Token token = null; private FrenchStemmer stemmer = null; private Set exclusions = null; public FrenchStemFilter( TokenStream in ) { - super(in); + super(in); stemmer = new FrenchStemmer(); } @@ -55,23 +54,22 @@ /** * @return Returns the next token in the stream, or null at EOS */ - public final Token next() + public final Token next(Token token) throws IOException { - if ( ( token = input.next() ) == null ) { + assert token != null; + if ( ( token = input.next(token) ) == null ) { return null; } - // Check the exclusiontable - else if ( exclusions != null && exclusions.contains( token.termText() ) ) { - return token; + String term = token.term(); + + // Check the exclusion table + if ( exclusions == null || !exclusions.contains( term ) ) { + String s = stemmer.stem( term ); + // If not stemmed, don't waste the time adjusting the token. + if ((s != null) && !s.equals( term ) ) + token.setTermBuffer(s); } - else { - String s = stemmer.stem( token.termText() ); - // If not stemmed, dont waste the time creating a new token - if ( !s.equals( token.termText() ) ) { - return new Token( s, token.startOffset(), token.endOffset(), token.type()); - } - return token; - } + return token; } /** * Set a alternative/custom FrenchStemmer for this filter. 
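[Illustrative sketch, not a file in this patch.] Every hunk above and below rewrites next() into the reusable next(Token) form: assert the passed-in token, forward it to input.next(token), mutate its term buffer in place, and return it instead of allocating a new Token per call. The class name ExampleLowerCaseFilter is invented purely for illustration; the Token and TokenFilter calls are the same ones the patch itself uses (next(Token), termBuffer(), termLength()).

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

/** Illustrative only: the reusable-token pattern applied throughout this patch. */
public final class ExampleLowerCaseFilter extends TokenFilter {

  public ExampleLowerCaseFilter(TokenStream in) {
    super(in);
  }

  public final Token next(Token token) throws IOException {
    assert token != null;               // callers must pass a reusable token
    token = input.next(token);          // upstream may hand back a different instance
    if (token == null)
      return null;
    // Work on the term buffer in place instead of building a String and a new Token.
    char[] buffer = token.termBuffer();
    int length = token.termLength();
    for (int i = 0; i < length; i++) {
      buffer[i] = Character.toLowerCase(buffer[i]);
    }
    return token;                       // a filter that caches a token must clone() it first
  }
}

Callers allocate the token once and loop, as the MoreLikeThis and SimilarityQueries hunks later in this patch do: final Token reusableToken = new Token(); for (Token t = ts.next(reusableToken); t != null; t = ts.next(reusableToken)) { ... }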
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (working copy) @@ -38,7 +38,7 @@ public class ElisionFilter extends TokenFilter { private Set articles = null; - private static String apostrophes = "'’"; + private static char[] apostrophes = {'\'', '’'}; public void setArticles(Set articles) { this.articles = new HashSet(); @@ -74,25 +74,36 @@ } /** - * Returns the next input Token whith termText() without elisioned start + * Returns the next input Token with term() without elisioned start */ - public Token next() throws IOException { - Token t = input.next(); - if (t == null) + public Token next(Token token) throws IOException { + assert token != null; + token = input.next(token); + if (token == null) return null; - String text = t.termText(); - System.out.println(text); - int minPoz = -1; - int poz; - for (int i = 0; i < apostrophes.length(); i++) { - poz = text.indexOf(apostrophes.charAt(i)); - if (poz != -1) - minPoz = (minPoz == -1) ? poz : Math.min(poz, minPoz); + + char[] termBuffer = token.termBuffer(); + int termLength = token.termLength(); + + int minPoz = Integer.MAX_VALUE; + for (int i = 0; i < apostrophes.length; i++) { + char apos = apostrophes[i]; + // The equivalent of String.indexOf(ch) + for (int poz = 0; poz < termLength ; poz++) { + if (termBuffer[poz] == apos) { + minPoz = Math.min(poz, minPoz); + break; + } + } } - if (minPoz != -1 - && articles.contains(text.substring(0, minPoz).toLowerCase())) - text = text.substring(minPoz + 1); - return new Token(text, t.startOffset(), t.endOffset(), t.type()); + + // An apostrophe has been found. If the prefix is an article strip it off. + if (minPoz != Integer.MAX_VALUE + && articles.contains(new String(token.termBuffer(), 0, minPoz).toLowerCase())) { + token.setTermBuffer(token.termBuffer(), minPoz + 1, token.termLength() - (minPoz + 1)); + } + + return token; } } Index: contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java (working copy) @@ -38,7 +38,6 @@ /** * The actual token in the input stream. */ - private Token token = null; private DutchStemmer stemmer = null; private Set exclusions = null; @@ -48,7 +47,7 @@ } /** - * Builds a DutchStemFilter that uses an exclusiontable. + * Builds a DutchStemFilter that uses an exclusion table. 
*/ public DutchStemFilter(TokenStream _in, Set exclusiontable) { this(_in); @@ -66,23 +65,21 @@ /** * @return Returns the next token in the stream, or null at EOS */ - public Token next() throws IOException { - if ((token = input.next()) == null) { + public Token next(Token token) throws IOException { + assert token != null; + if ((token = input.next(token)) == null) { return null; } + String term = token.term(); - // Check the exclusiontable - else if (exclusions != null && exclusions.contains(token.termText())) { - return token; - } else { - String s = stemmer.stem(token.termText()); - // If not stemmed, dont waste the time creating a new token - if (!s.equals(token.termText())) { - return new Token(s, token.startOffset(), - token.endOffset(), token.type()); - } - return token; + // Check the exclusion table. + if (exclusions == null || !exclusions.contains(term)) { + String s = stemmer.stem(term); + // If not stemmed, don't waste the time adjusting the token. + if ((s != null) && !s.equals(term)) + token.setTermBuffer(s); } + return token; } /** Index: contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (working copy) @@ -40,31 +40,41 @@ breaker = BreakIterator.getWordInstance(new Locale("th")); } - public Token next() throws IOException { + public Token next(Token token) throws IOException { + assert token != null; if (thaiToken != null) { - String text = thaiToken.termText(); int start = breaker.current(); int end = breaker.next(); if (end != BreakIterator.DONE) { - return new Token(text.substring(start, end), - thaiToken.startOffset()+start, thaiToken.startOffset()+end, thaiToken.type()); + token.setTermBuffer(thaiToken.termBuffer(), start, end - start); + token.setStartOffset(thaiToken.startOffset()+start); + token.setEndOffset(thaiToken.startOffset()+end); + token.setType(thaiToken.type()); + token.setPayload(thaiToken.getPayload()); + token.setFlags(thaiToken.getFlags()); + return token; } thaiToken = null; } - Token tk = input.next(); - if (tk == null) { + + token = input.next(token); + if (token == null || token.termLength() == 0) { return null; } - String text = tk.termText(); + + String text = token.term(); if (UnicodeBlock.of(text.charAt(0)) != UnicodeBlock.THAI) { - return new Token(text.toLowerCase(), tk.startOffset(), tk.endOffset(), tk.type()); + token.setTermBuffer(text.toLowerCase()); + return token; } - thaiToken = tk; + + thaiToken = (Token) token.clone(); breaker.setText(text); int end = breaker.next(); if (end != BreakIterator.DONE) { - return new Token(text.substring(0, end), - thaiToken.startOffset(), thaiToken.startOffset()+end, thaiToken.type()); + token.setTermBuffer(text, 0, end); + token.setEndOffset(token.startOffset() + end); + return token; } return null; } Index: contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java (working copy) @@ -42,6 +42,7 @@ } public Token next(Token result) throws IOException { + assert result != null; result = input.next(result); if (result != null
&& result.type().equals(typeMatch)){ result.setPayload(thePayload); Index: contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java (working copy) @@ -40,6 +40,7 @@ public Token next(Token result) throws IOException { + assert result != null; result = input.next(result); if (result != null && result.type() != null && result.type().equals("") == false){ result.setPayload(new Payload(result.type().getBytes("UTF-8"))); Index: contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java (revision 684150) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java (working copy) @@ -39,6 +39,7 @@ } public Token next(Token result) throws IOException { + assert result != null; result = input.next(result); if (result != null){ byte[] data = new byte[8]; Index: contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java =================================================================== --- contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java (revision 684150) +++ contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java (working copy) @@ -28,6 +28,7 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.Hits; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; @@ -808,10 +809,11 @@ throws IOException { TokenStream ts = analyzer.tokenStream(fieldName, r); - org.apache.lucene.analysis.Token token; int tokenCount=0; - while ((token = ts.next()) != null) { // for every token - String word = token.termText(); + // for every token + final Token reusableToken = new Token(); + for (Token token = ts.next(reusableToken); token != null; token = ts.next(reusableToken)) { + String word = token.term(); tokenCount++; if(tokenCount>maxNumTokensParsed) { @@ -872,7 +874,7 @@ * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}. 
* * @param r the reader that has the content of the document - * @return the most intresting words in the document ordered by score, with the highest scoring, or best entry, first + * @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first * * @see #retrieveInterestingTerms */ Index: contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java =================================================================== --- contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java (revision 684150) +++ contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java (working copy) @@ -21,6 +21,7 @@ import java.util.Set; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; @@ -85,12 +86,11 @@ throws IOException { TokenStream ts = a.tokenStream( field, new StringReader( body)); - org.apache.lucene.analysis.Token t; BooleanQuery tmp = new BooleanQuery(); Set already = new HashSet(); // ignore dups - while ( (t = ts.next()) != null) - { - String word = t.termText(); + final Token reusableToken = new Token(); + for (Token token = ts.next(reusableToken); token != null; token = ts.next(reusableToken)) { + String word = token.term(); // ignore opt stop words if ( stop != null && stop.contains( word)) continue; Index: contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java =================================================================== --- contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java (revision 684150) +++ contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java (working copy) @@ -104,18 +104,20 @@ { if(f.queryString==null) return; TokenStream ts=analyzer.tokenStream(f.fieldName,new StringReader(f.queryString)); - Token token=ts.next(); + final Token reusableToken = new Token(); + Token token = ts.next(reusableToken); int corpusNumDocs=reader.numDocs(); Term internSavingTemplateTerm =new Term(f.fieldName,""); //optimization to avoid constructing new Term() objects HashSet processedTerms=new HashSet(); while(token!=null) - { - if(!processedTerms.contains(token.termText())) + { + String term = token.term(); + if(!processedTerms.contains(term)) { - processedTerms.add(token.termText()); + processedTerms.add(term); ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term float minScore=0; - Term startTerm=internSavingTemplateTerm.createTerm(token.termText()); + Term startTerm=internSavingTemplateTerm.createTerm(term); FuzzyTermEnum fe=new FuzzyTermEnum(reader,startTerm,f.minSimilarity,f.prefixLength); TermEnum origEnum = reader.terms(startTerm); int df=0; @@ -162,8 +164,8 @@ q.insert(st); } } - token=ts.next(); - } + token=ts.next(reusableToken); + } } public Query rewrite(IndexReader reader) throws IOException