Index: src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java
===================================================================
--- src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java (revision 684150)
+++ src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java (working copy)
@@ -141,34 +141,34 @@
private final class TestFilter extends TokenFilter {
- private org.apache.lucene.analysis.Token prevToken;
+ private Token prevToken;
public TestFilter(TokenStream in) {
super(in);
}
- public final org.apache.lucene.analysis.Token next() throws java.io.IOException {
+ public final Token next(Token token) throws java.io.IOException {
if (multiToken > 0) {
- org.apache.lucene.analysis.Token token =
- new org.apache.lucene.analysis.Token("multi"+(multiToken+1), prevToken.startOffset(),
- prevToken.endOffset(), prevToken.type());
+ token.reinit("multi"+(multiToken+1), prevToken.startOffset(), prevToken.endOffset(), prevToken.type());
token.setPositionIncrement(0);
multiToken--;
return token;
} else {
- org.apache.lucene.analysis.Token t = input.next();
- prevToken = t;
- if (t == null)
+ token = input.next(token);
+ if (token == null) {
+ prevToken = null;
return null;
- String text = t.termText();
+ }
+ prevToken = (Token) token.clone();
+ String text = token.term();
if (text.equals("triplemulti")) {
multiToken = 2;
- return t;
+ return token;
} else if (text.equals("multi")) {
multiToken = 1;
- return t;
+ return token;
} else {
- return t;
+ return token;
}
}
}
@@ -197,20 +197,14 @@
super(in);
}
- public final org.apache.lucene.analysis.Token next() throws java.io.IOException {
- for (Token t = input.next(); t != null; t = input.next()) {
- if (t.termText().equals("the")) {
+ public final Token next(Token token) throws java.io.IOException {
+ for (token = input.next(token); token != null; token = input.next(token)) {
+ if (token.term().equals("the")) {
// stopword, do nothing
- } else if (t.termText().equals("quick")) {
- org.apache.lucene.analysis.Token token =
- new org.apache.lucene.analysis.Token(t.termText(), t.startOffset(),
- t.endOffset(), t.type());
+ } else if (token.term().equals("quick")) {
token.setPositionIncrement(2);
return token;
} else {
- org.apache.lucene.analysis.Token token =
- new org.apache.lucene.analysis.Token(t.termText(), t.startOffset(),
- t.endOffset(), t.type());
token.setPositionIncrement(1);
return token;
}
Index: src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java
===================================================================
--- src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java (revision 684150)
+++ src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java (working copy)
@@ -319,7 +319,7 @@
}
private static class EmptyTokenStream extends TokenStream {
- public Token next() {
+ public Token next(Token token) {
return null;
}
}
Index: src/test/org/apache/lucene/queryParser/TestQueryParser.java
===================================================================
--- src/test/org/apache/lucene/queryParser/TestQueryParser.java (revision 684150)
+++ src/test/org/apache/lucene/queryParser/TestQueryParser.java (working copy)
@@ -75,18 +75,18 @@
boolean inPhrase = false;
int savedStart = 0, savedEnd = 0;
- public Token next() throws IOException {
+ public Token next(Token token) throws IOException {
if (inPhrase) {
inPhrase = false;
- return new Token("phrase2", savedStart, savedEnd);
+ return token.reinit("phrase2", savedStart, savedEnd);
} else
- for (Token token = input.next(); token != null; token = input.next()) {
- if (token.termText().equals("phrase")) {
+ for (token = input.next(token); token != null; token = input.next(token)) {
+ if (token.term().equals("phrase")) {
inPhrase = true;
savedStart = token.startOffset();
savedEnd = token.endOffset();
- return new Token("phrase1", savedStart, savedEnd);
- } else if (!token.termText().equals("stop"))
+ return token.reinit("phrase1", savedStart, savedEnd);
+ } else if (!token.term().equals("stop"))
return token;
}
return null;
Index: src/test/org/apache/lucene/analysis/TestToken.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestToken.java (revision 684150)
+++ src/test/org/apache/lucene/analysis/TestToken.java (working copy)
@@ -17,7 +17,6 @@
* limitations under the License.
*/
-import java.io.*;
import org.apache.lucene.util.LuceneTestCase;
public class TestToken extends LuceneTestCase {
@@ -26,6 +25,119 @@
super(name);
}
+ public void testCtor() throws Exception {
+ Token t = new Token();
+ char[] content = "hello".toCharArray();
+ t.setTermBuffer(content, 0, content.length);
+ char[] buf = t.termBuffer();
+ assertNotSame(t.termBuffer(), content);
+ assertEquals("hello", t.term());
+ assertEquals("word", t.type());
+ assertEquals(0, t.getFlags());
+
+ t = new Token(6, 22);
+ t.setTermBuffer(content, 0, content.length);
+ assertEquals("hello", t.term());
+ assertEquals("(hello,6,22)", t.toString());
+ assertEquals("word", t.type());
+ assertEquals(0, t.getFlags());
+
+ t = new Token(6, 22, 7);
+ t.setTermBuffer(content, 0, content.length);
+ assertEquals("hello", t.term());
+ assertEquals("(hello,6,22)", t.toString());
+ assertEquals(7, t.getFlags());
+
+ t = new Token(6, 22, "junk");
+ t.setTermBuffer(content, 0, content.length);
+ assertEquals("hello", t.term());
+ assertEquals("(hello,6,22,type=junk)", t.toString());
+ assertEquals(0, t.getFlags());
+ }
+
+ public void testResize() {
+ Token t = new Token();
+ char[] content = "hello".toCharArray();
+ t.setTermBuffer(content, 0, content.length);
+ for (int i = 0; i < 2000; i++)
+ {
+ t.resizeTermBuffer(i);
+ assertTrue(i <= t.termBuffer().length);
+ assertEquals("hello", t.term());
+ }
+ }
+
+ public void testGrow() {
+ Token t = new Token();
+ StringBuffer buf = new StringBuffer("ab");
+ for (int i = 0; i < 20; i++)
+ {
+ char[] content = buf.toString().toCharArray();
+ t.setTermBuffer(content, 0, content.length);
+ assertEquals(buf.length(), t.termLength());
+ assertEquals(buf.toString(), t.term());
+ buf.append(buf.toString());
+ }
+ assertEquals(1048576, t.termLength());
+ assertEquals(1179654, t.termBuffer().length);
+
+ // now as a string, first variant
+ t = new Token();
+ buf = new StringBuffer("ab");
+ for (int i = 0; i < 20; i++)
+ {
+ String content = buf.toString();
+ t.setTermBuffer(content, 0, content.length());
+ assertEquals(content.length(), t.termLength());
+ assertEquals(content, t.term());
+ buf.append(content);
+ }
+ assertEquals(1048576, t.termLength());
+ assertEquals(1179654, t.termBuffer().length);
+
+ // now as a string, second variant
+ t = new Token();
+ buf = new StringBuffer("ab");
+ for (int i = 0; i < 20; i++)
+ {
+ String content = buf.toString();
+ t.setTermBuffer(content);
+ assertEquals(content.length(), t.termLength());
+ assertEquals(content, t.term());
+ buf.append(content);
+ }
+ assertEquals(1048576, t.termLength());
+ assertEquals(1179654, t.termBuffer().length);
+
+ // Test for slow growth to a long term
+ t = new Token();
+ buf = new StringBuffer("a");
+ for (int i = 0; i < 20000; i++)
+ {
+ String content = buf.toString();
+ t.setTermBuffer(content);
+ assertEquals(content.length(), t.termLength());
+ assertEquals(content, t.term());
+ buf.append("a");
+ }
+ assertEquals(20000, t.termLength());
+ assertEquals(20331, t.termBuffer().length);
+
+ // Test for slow growth to a long term
+ t = new Token();
+ buf = new StringBuffer("a");
+ for (int i = 0; i < 20000; i++)
+ {
+ String content = buf.toString();
+ t.setTermBuffer(content);
+ assertEquals(content.length(), t.termLength());
+ assertEquals(content, t.term());
+ buf.append("a");
+ }
+ assertEquals(20000, t.termLength());
+ assertEquals(20331, t.termBuffer().length);
+ }
+
public void testToString() throws Exception {
char[] b = {'a', 'l', 'o', 'h', 'a'};
Token t = new Token("", 0, 5);
@@ -40,10 +152,10 @@
Token t = new Token("hello", 0, 5);
assertEquals(t.termText(), "hello");
assertEquals(t.termLength(), 5);
- assertEquals(new String(t.termBuffer(), 0, 5), "hello");
+ assertEquals(t.term(), "hello");
t.setTermText("hello2");
assertEquals(t.termLength(), 6);
- assertEquals(new String(t.termBuffer(), 0, 6), "hello2");
+ assertEquals(t.term(), "hello2");
t.setTermBuffer("hello3".toCharArray(), 0, 6);
assertEquals(t.termText(), "hello3");
@@ -53,4 +165,13 @@
buffer[1] = 'o';
assertEquals(t.termText(), "hollo3");
}
+
+ public void testClone() throws Exception {
+ Token t = new Token(0, 5);
+ char[] content = "hello".toCharArray();
+ t.setTermBuffer(content, 0, 5);
+ char[] buf = t.termBuffer();
+ Token copy = (Token) t.clone();
+ assertNotSame(buf, copy.termBuffer());
+ }
}
Index: src/test/org/apache/lucene/analysis/TestPerFieldAnalzyerWrapper.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestPerFieldAnalzyerWrapper.java (revision 684150)
+++ src/test/org/apache/lucene/analysis/TestPerFieldAnalzyerWrapper.java (working copy)
@@ -29,16 +29,16 @@
TokenStream tokenStream = analyzer.tokenStream("field",
new StringReader(text));
- Token token = tokenStream.next();
+ Token token = tokenStream.next(new Token());
assertEquals("WhitespaceAnalyzer does not lowercase",
"Qwerty",
- token.termText());
+ token.term());
tokenStream = analyzer.tokenStream("special",
new StringReader(text));
- token = tokenStream.next();
+ token = tokenStream.next(token);
assertEquals("SimpleAnalyzer lowercases",
"qwerty",
- token.termText());
+ token.term());
}
}
Index: src/test/org/apache/lucene/analysis/TeeSinkTokenTest.java
===================================================================
--- src/test/org/apache/lucene/analysis/TeeSinkTokenTest.java (revision 684150)
+++ src/test/org/apache/lucene/analysis/TeeSinkTokenTest.java (working copy)
@@ -16,10 +16,10 @@
* limitations under the License.
*/
-import junit.framework.TestCase;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.English;
+import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
import java.io.StringReader;
@@ -29,7 +29,7 @@
/**
* tests for the TeeTokenFilter and SinkTokenizer
*/
-public class TeeSinkTokenTest extends TestCase {
+public class TeeSinkTokenTest extends LuceneTestCase {
protected StringBuffer buffer1;
protected StringBuffer buffer2;
protected String[] tokens1;
@@ -63,23 +63,23 @@
SinkTokenizer sink1 = new SinkTokenizer(null) {
public void add(Token t) {
- if (t != null && t.termText().equalsIgnoreCase("The")) {
+ if (t != null && t.term().equalsIgnoreCase("The")) {
super.add(t);
}
}
};
TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), sink1);
- Token token = null;
int i = 0;
- while ((token = source.next()) != null) {
- assertTrue(token.termText() + " is not equal to " + tokens1[i], token.termText().equals(tokens1[i]) == true);
+ final Token reusableToken = new Token();
+ for (Token token = source.next(reusableToken); token != null; token = source.next(reusableToken)) {
+ assertTrue(token.term() + " is not equal to " + tokens1[i], token.term().equals(tokens1[i]) == true);
i++;
}
assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
assertTrue("sink1 Size: " + sink1.getTokens().size() + " is not: " + 2, sink1.getTokens().size() == 2);
i = 0;
- while ((token = sink1.next()) != null) {
- assertTrue(token.termText() + " is not equal to " + "The", token.termText().equalsIgnoreCase("The") == true);
+ for (Token token = sink1.next(reusableToken); token != null; token = sink1.next(reusableToken)) {
+ assertTrue(token.term() + " is not equal to " + "The", token.term().equalsIgnoreCase("The") == true);
i++;
}
assertTrue(i + " does not equal: " + sink1.getTokens().size(), i == sink1.getTokens().size());
@@ -88,54 +88,54 @@
public void testMultipleSources() throws Exception {
SinkTokenizer theDetector = new SinkTokenizer(null) {
public void add(Token t) {
- if (t != null && t.termText().equalsIgnoreCase("The")) {
+ if (t != null && t.term().equalsIgnoreCase("The")) {
super.add(t);
}
}
};
SinkTokenizer dogDetector = new SinkTokenizer(null) {
public void add(Token t) {
- if (t != null && t.termText().equalsIgnoreCase("Dogs")) {
+ if (t != null && t.term().equalsIgnoreCase("Dogs")) {
super.add(t);
}
}
};
TokenStream source1 = new CachingTokenFilter(new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), theDetector), dogDetector));
TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer2.toString())), theDetector), dogDetector);
- Token token = null;
int i = 0;
- while ((token = source1.next()) != null) {
- assertTrue(token.termText() + " is not equal to " + tokens1[i], token.termText().equals(tokens1[i]) == true);
+ final Token reusableToken = new Token();
+ for (Token token = source1.next(reusableToken); token != null; token = source1.next(reusableToken)) {
+ assertTrue(token.term() + " is not equal to " + tokens1[i], token.term().equals(tokens1[i]) == true);
i++;
}
assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 2, theDetector.getTokens().size() == 2);
assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 1, dogDetector.getTokens().size() == 1);
i = 0;
- while ((token = source2.next()) != null) {
- assertTrue(token.termText() + " is not equal to " + tokens2[i], token.termText().equals(tokens2[i]) == true);
+ for (Token token = source2.next(reusableToken); token != null; token = source2.next(reusableToken)) {
+ assertTrue(token.term() + " is not equal to " + tokens2[i], token.term().equals(tokens2[i]) == true);
i++;
}
assertTrue(i + " does not equal: " + tokens2.length, i == tokens2.length);
assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 4, theDetector.getTokens().size() == 4);
assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 2, dogDetector.getTokens().size() == 2);
i = 0;
- while ((token = theDetector.next()) != null) {
- assertTrue(token.termText() + " is not equal to " + "The", token.termText().equalsIgnoreCase("The") == true);
+ for (Token token = theDetector.next(reusableToken); token != null; token = theDetector.next(reusableToken)) {
+ assertTrue(token.term() + " is not equal to " + "The", token.term().equalsIgnoreCase("The") == true);
i++;
}
assertTrue(i + " does not equal: " + theDetector.getTokens().size(), i == theDetector.getTokens().size());
i = 0;
- while ((token = dogDetector.next()) != null) {
- assertTrue(token.termText() + " is not equal to " + "Dogs", token.termText().equalsIgnoreCase("Dogs") == true);
+ for (Token token = dogDetector.next(reusableToken); token != null; token = dogDetector.next(reusableToken)) {
+ assertTrue(token.term() + " is not equal to " + "Dogs", token.term().equalsIgnoreCase("Dogs") == true);
i++;
}
assertTrue(i + " does not equal: " + dogDetector.getTokens().size(), i == dogDetector.getTokens().size());
source1.reset();
TokenStream lowerCasing = new LowerCaseFilter(source1);
i = 0;
- while ((token = lowerCasing.next()) != null) {
- assertTrue(token.termText() + " is not equal to " + tokens1[i].toLowerCase(), token.termText().equals(tokens1[i].toLowerCase()) == true);
+ for (Token token = lowerCasing.next(reusableToken); token != null; token = lowerCasing.next(reusableToken)) {
+ assertTrue(token.term() + " is not equal to " + tokens1[i].toLowerCase(), token.term().equals(tokens1[i].toLowerCase()) == true);
i++;
}
assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
@@ -157,14 +157,14 @@
}
//make sure we produce the same tokens
ModuloSinkTokenizer sink = new ModuloSinkTokenizer(tokCount[k], 100);
- Token next = new Token();
+ final Token reusableToken = new Token();
TokenStream result = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), sink);
- while ((next = result.next(next)) != null) {
+ while (result.next(reusableToken) != null) {
}
result = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), 100);
- next = new Token();
List tmp = new ArrayList();
- while ((next = result.next(next)) != null) {
+ Token next;
+ while ((next = result.next(reusableToken)) != null) {
tmp.add(next.clone());
}
List sinkList = sink.getTokens();
@@ -172,7 +172,7 @@
for (int i = 0; i < tmp.size(); i++) {
Token tfTok = (Token) tmp.get(i);
Token sinkTok = (Token) sinkList.get(i);
- assertTrue(tfTok.termText() + " is not equal to " + sinkTok.termText() + " at token: " + i, tfTok.termText().equals(sinkTok.termText()) == true);
+ assertTrue(tfTok.term() + " is not equal to " + sinkTok.term() + " at token: " + i, tfTok.term().equals(sinkTok.term()) == true);
}
//simulate two fields, each being analyzed once, for 20 documents
@@ -180,14 +180,12 @@
int tfPos = 0;
long start = System.currentTimeMillis();
for (int i = 0; i < 20; i++) {
- next = new Token();
result = new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString())));
- while ((next = result.next(next)) != null) {
+ while ((next = result.next(reusableToken)) != null) {
tfPos += next.getPositionIncrement();
}
- next = new Token();
result = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), modCounts[j]);
- while ((next = result.next(next)) != null) {
+ while ((next = result.next(reusableToken)) != null) {
tfPos += next.getPositionIncrement();
}
}
@@ -198,14 +196,13 @@
start = System.currentTimeMillis();
for (int i = 0; i < 20; i++) {
sink = new ModuloSinkTokenizer(tokCount[k], modCounts[j]);
- next = new Token();
result = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), sink);
- while ((next = result.next(next)) != null) {
+ while ((next = result.next(reusableToken)) != null) {
sinkPos += next.getPositionIncrement();
}
//System.out.println("Modulo--------");
result = sink;
- while ((next = result.next(next)) != null) {
+ while ((next = result.next(reusableToken)) != null) {
sinkPos += next.getPositionIncrement();
}
}
@@ -254,7 +251,7 @@
public void add(Token t) {
if (t != null && count % modCount == 0) {
- lst.add(t.clone());
+ super.add(t);
}
count++;
}
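
For context, the Tee/Sink pattern exercised by this test splits one tokenization pass across several consumers. A minimal usage sketch against the reuse API (not part of this patch; the input text and the filter condition are illustrative):

    // Sketch: one WhitespaceTokenizer pass feeds the main stream and a sink
    // that buffers only tokens equal to "lucene" for a second consumer.
    SinkTokenizer sink = new SinkTokenizer(null) {
      public void add(Token t) {
        if (t != null && t.term().equalsIgnoreCase("lucene")) {
          super.add(t);
        }
      }
    };
    TokenStream source = new TeeTokenFilter(
        new WhitespaceTokenizer(new StringReader("apache lucene search")), sink);
    final Token reusableToken = new Token();
    for (Token t = source.next(reusableToken); t != null; t = source.next(reusableToken)) {
      // consume the main stream; matching tokens are also buffered in sink.getTokens()
    }
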
Index: src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java (revision 684150)
+++ src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java (working copy)
@@ -42,11 +42,11 @@
TokenStream stream = new TokenStream() {
private int index = 0;
- public Token next() throws IOException {
+ public Token next(Token token) throws IOException {
if (index == tokens.length) {
return null;
} else {
- return new Token(tokens[index++], 0, 0);
+ return token.reinit(tokens[index++], 0, 0);
}
}
@@ -91,10 +91,10 @@
private void checkTokens(TokenStream stream) throws IOException {
int count = 0;
- Token token;
- while ((token = stream.next()) != null) {
+ final Token reusableToken = new Token();
+ for (Token token = stream.next(reusableToken); token != null; token = stream.next(token)) {
assertTrue(count < tokens.length);
- assertEquals(tokens[count], token.termText());
+ assertEquals(tokens[count], token.term());
count++;
}
Index: src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java (revision 684150)
+++ src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java (working copy)
@@ -35,10 +35,11 @@
public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes, int[] expectedPosIncrs) throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+ Token t = new Token();
for (int i = 0; i < expectedImages.length; i++) {
- Token t = ts.next();
+ t = ts.next(t);
assertNotNull(t);
- assertEquals(expectedImages[i], t.termText());
+ assertEquals(expectedImages[i], t.term());
if (expectedTypes != null) {
assertEquals(expectedTypes[i], t.type());
}
@@ -46,7 +47,7 @@
assertEquals(expectedPosIncrs[i], t.getPositionIncrement());
}
}
- assertNull(ts.next());
+ assertNull(ts.next(t));
ts.close();
}
Index: src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java (revision 684150)
+++ src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java (working copy)
@@ -25,81 +25,82 @@
public void testU() throws Exception {
TokenStream stream = new WhitespaceTokenizer(new StringReader("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï IJ Ð Ñ Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ij ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl"));
ISOLatin1AccentFilter filter = new ISOLatin1AccentFilter(stream);
- assertEquals("Des", filter.next().termText());
- assertEquals("mot", filter.next().termText());
- assertEquals("cles", filter.next().termText());
- assertEquals("A", filter.next().termText());
- assertEquals("LA", filter.next().termText());
- assertEquals("CHAINE", filter.next().termText());
- assertEquals("A", filter.next().termText());
- assertEquals("A", filter.next().termText());
- assertEquals("A", filter.next().termText());
- assertEquals("A", filter.next().termText());
- assertEquals("A", filter.next().termText());
- assertEquals("A", filter.next().termText());
- assertEquals("AE", filter.next().termText());
- assertEquals("C", filter.next().termText());
- assertEquals("E", filter.next().termText());
- assertEquals("E", filter.next().termText());
- assertEquals("E", filter.next().termText());
- assertEquals("E", filter.next().termText());
- assertEquals("I", filter.next().termText());
- assertEquals("I", filter.next().termText());
- assertEquals("I", filter.next().termText());
- assertEquals("I", filter.next().termText());
- assertEquals("IJ", filter.next().termText());
- assertEquals("D", filter.next().termText());
- assertEquals("N", filter.next().termText());
- assertEquals("O", filter.next().termText());
- assertEquals("O", filter.next().termText());
- assertEquals("O", filter.next().termText());
- assertEquals("O", filter.next().termText());
- assertEquals("O", filter.next().termText());
- assertEquals("O", filter.next().termText());
- assertEquals("OE", filter.next().termText());
- assertEquals("TH", filter.next().termText());
- assertEquals("U", filter.next().termText());
- assertEquals("U", filter.next().termText());
- assertEquals("U", filter.next().termText());
- assertEquals("U", filter.next().termText());
- assertEquals("Y", filter.next().termText());
- assertEquals("Y", filter.next().termText());
- assertEquals("a", filter.next().termText());
- assertEquals("a", filter.next().termText());
- assertEquals("a", filter.next().termText());
- assertEquals("a", filter.next().termText());
- assertEquals("a", filter.next().termText());
- assertEquals("a", filter.next().termText());
- assertEquals("ae", filter.next().termText());
- assertEquals("c", filter.next().termText());
- assertEquals("e", filter.next().termText());
- assertEquals("e", filter.next().termText());
- assertEquals("e", filter.next().termText());
- assertEquals("e", filter.next().termText());
- assertEquals("i", filter.next().termText());
- assertEquals("i", filter.next().termText());
- assertEquals("i", filter.next().termText());
- assertEquals("i", filter.next().termText());
- assertEquals("ij", filter.next().termText());
- assertEquals("d", filter.next().termText());
- assertEquals("n", filter.next().termText());
- assertEquals("o", filter.next().termText());
- assertEquals("o", filter.next().termText());
- assertEquals("o", filter.next().termText());
- assertEquals("o", filter.next().termText());
- assertEquals("o", filter.next().termText());
- assertEquals("o", filter.next().termText());
- assertEquals("oe", filter.next().termText());
- assertEquals("ss", filter.next().termText());
- assertEquals("th", filter.next().termText());
- assertEquals("u", filter.next().termText());
- assertEquals("u", filter.next().termText());
- assertEquals("u", filter.next().termText());
- assertEquals("u", filter.next().termText());
- assertEquals("y", filter.next().termText());
- assertEquals("y", filter.next().termText());
- assertEquals("fi", filter.next().termText());
- assertEquals("fl", filter.next().termText());
- assertNull(filter.next());
+ Token token = new Token();
+ assertEquals("Des", filter.next(token).term());
+ assertEquals("mot", filter.next(token).term());
+ assertEquals("cles", filter.next(token).term());
+ assertEquals("A", filter.next(token).term());
+ assertEquals("LA", filter.next(token).term());
+ assertEquals("CHAINE", filter.next(token).term());
+ assertEquals("A", filter.next(token).term());
+ assertEquals("A", filter.next(token).term());
+ assertEquals("A", filter.next(token).term());
+ assertEquals("A", filter.next(token).term());
+ assertEquals("A", filter.next(token).term());
+ assertEquals("A", filter.next(token).term());
+ assertEquals("AE", filter.next(token).term());
+ assertEquals("C", filter.next(token).term());
+ assertEquals("E", filter.next(token).term());
+ assertEquals("E", filter.next(token).term());
+ assertEquals("E", filter.next(token).term());
+ assertEquals("E", filter.next(token).term());
+ assertEquals("I", filter.next(token).term());
+ assertEquals("I", filter.next(token).term());
+ assertEquals("I", filter.next(token).term());
+ assertEquals("I", filter.next(token).term());
+ assertEquals("IJ", filter.next(token).term());
+ assertEquals("D", filter.next(token).term());
+ assertEquals("N", filter.next(token).term());
+ assertEquals("O", filter.next(token).term());
+ assertEquals("O", filter.next(token).term());
+ assertEquals("O", filter.next(token).term());
+ assertEquals("O", filter.next(token).term());
+ assertEquals("O", filter.next(token).term());
+ assertEquals("O", filter.next(token).term());
+ assertEquals("OE", filter.next(token).term());
+ assertEquals("TH", filter.next(token).term());
+ assertEquals("U", filter.next(token).term());
+ assertEquals("U", filter.next(token).term());
+ assertEquals("U", filter.next(token).term());
+ assertEquals("U", filter.next(token).term());
+ assertEquals("Y", filter.next(token).term());
+ assertEquals("Y", filter.next(token).term());
+ assertEquals("a", filter.next(token).term());
+ assertEquals("a", filter.next(token).term());
+ assertEquals("a", filter.next(token).term());
+ assertEquals("a", filter.next(token).term());
+ assertEquals("a", filter.next(token).term());
+ assertEquals("a", filter.next(token).term());
+ assertEquals("ae", filter.next(token).term());
+ assertEquals("c", filter.next(token).term());
+ assertEquals("e", filter.next(token).term());
+ assertEquals("e", filter.next(token).term());
+ assertEquals("e", filter.next(token).term());
+ assertEquals("e", filter.next(token).term());
+ assertEquals("i", filter.next(token).term());
+ assertEquals("i", filter.next(token).term());
+ assertEquals("i", filter.next(token).term());
+ assertEquals("i", filter.next(token).term());
+ assertEquals("ij", filter.next(token).term());
+ assertEquals("d", filter.next(token).term());
+ assertEquals("n", filter.next(token).term());
+ assertEquals("o", filter.next(token).term());
+ assertEquals("o", filter.next(token).term());
+ assertEquals("o", filter.next(token).term());
+ assertEquals("o", filter.next(token).term());
+ assertEquals("o", filter.next(token).term());
+ assertEquals("o", filter.next(token).term());
+ assertEquals("oe", filter.next(token).term());
+ assertEquals("ss", filter.next(token).term());
+ assertEquals("th", filter.next(token).term());
+ assertEquals("u", filter.next(token).term());
+ assertEquals("u", filter.next(token).term());
+ assertEquals("u", filter.next(token).term());
+ assertEquals("u", filter.next(token).term());
+ assertEquals("y", filter.next(token).term());
+ assertEquals("y", filter.next(token).term());
+ assertEquals("fi", filter.next(token).term());
+ assertEquals("fl", filter.next(token).term());
+ assertNull(filter.next(token));
}
}
Index: src/test/org/apache/lucene/analysis/TestLengthFilter.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestLengthFilter.java (revision 684150)
+++ src/test/org/apache/lucene/analysis/TestLengthFilter.java (working copy)
@@ -27,10 +27,11 @@
TokenStream stream = new WhitespaceTokenizer(
new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
LengthFilter filter = new LengthFilter(stream, 2, 6);
- assertEquals("short", filter.next().termText());
- assertEquals("ab", filter.next().termText());
- assertEquals("foo", filter.next().termText());
- assertNull(filter.next());
+ Token token = new Token();
+ assertEquals("short", filter.next(token).term());
+ assertEquals("ab", filter.next(token).term());
+ assertEquals("foo", filter.next(token).term());
+ assertNull(filter.next(token));
}
}
Index: src/test/org/apache/lucene/analysis/TestAnalyzers.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestAnalyzers.java (revision 684150)
+++ src/test/org/apache/lucene/analysis/TestAnalyzers.java (working copy)
@@ -17,13 +17,14 @@
* limitations under the License.
*/
-import java.io.*;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.LinkedList;
import java.util.List;
-import java.util.LinkedList;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.index.Payload;
import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.index.Payload;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
public class TestAnalyzers extends LuceneTestCase {
@@ -35,12 +36,14 @@
String input,
String[] output) throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+ final Token reusableToken = new Token();
+ Token t;
 for (int i=0; i<output.length; i++) {
Index: src/java/org/apache/lucene/analysis/TokenStream.java
===================================================================
--- src/java/org/apache/lucene/analysis/TokenStream.java (revision 684150)
+++ src/java/org/apache/lucene/analysis/TokenStream.java (working copy)
This is an abstract class.
- NOTE: subclasses must override at least one of {@link
- #next()} or {@link #next(Token)}.
+ NOTE: subclasses must override {@link #next(Token)}. It's
+ also OK to instead override {@link #next()} but that
+ method is now deprecated in favor of {@link #next(Token)}.
NOTE: subclasses overriding {@link #next(Token)} must
call {@link Token#clear()}.
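
To make the contract described above concrete, a filter written against the reuse API looks roughly like this (an illustrative sketch, not part of this patch; the class name is hypothetical):

    import java.io.IOException;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;

    // Hypothetical filter: lowercases ASCII letters in place, reusing the
    // caller-supplied Token instead of allocating a new one per term.
    public final class AsciiLowerCaseFilter extends TokenFilter {
      public AsciiLowerCaseFilter(TokenStream in) {
        super(in);
      }
      public Token next(Token result) throws IOException {
        assert result != null;
        Token token = input.next(result); // may return result itself or another Token
        if (token == null)
          return null;
        char[] buffer = token.termBuffer();
        int length = token.termLength();
        for (int i = 0; i < length; i++) {
          char c = buffer[i];
          if (c >= 'A' && c <= 'Z')
            buffer[i] = (char) (c + ('a' - 'A'));
        }
        return token;
      }
    }
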
Index: src/java/org/apache/lucene/analysis/PorterStemFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/PorterStemFilter.java (revision 684150)
+++ src/java/org/apache/lucene/analysis/PorterStemFilter.java (working copy)
@@ -46,9 +46,10 @@
}
public final Token next(Token result) throws IOException {
+ assert result != null;
result = input.next(result);
if (result != null) {
- if (stemmer.stem(result.termBuffer(), 0, result.termLength))
+ if (stemmer.stem(result.termBuffer(), 0, result.termLength()))
result.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
return result;
} else
Index: src/java/org/apache/lucene/analysis/KeywordTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/KeywordTokenizer.java (revision 684150)
+++ src/java/org/apache/lucene/analysis/KeywordTokenizer.java (working copy)
@@ -39,6 +39,7 @@
}
public Token next(Token result) throws IOException {
+ assert result != null;
if (!done) {
done = true;
int upto = 0;
@@ -51,7 +52,7 @@
if (upto == buffer.length)
buffer = result.resizeTermBuffer(1+buffer.length);
}
- result.termLength = upto;
+ result.setTermLength(upto);
return result;
}
return null;
Index: src/java/org/apache/lucene/analysis/standard/StandardFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/standard/StandardFilter.java (revision 684150)
+++ src/java/org/apache/lucene/analysis/standard/StandardFilter.java (working copy)
@@ -39,6 +39,7 @@
* Removes dots from acronyms.
*/
public final Token next(Token result) throws java.io.IOException {
+ assert result != null;
Token t = input.next(result);
if (t == null)
Index: src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (revision 684150)
+++ src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (working copy)
@@ -133,6 +133,7 @@
* @see org.apache.lucene.analysis.TokenStream#next()
*/
public Token next(Token result) throws IOException {
+ assert result != null;
int posIncr = 1;
while(true) {
Index: src/java/org/apache/lucene/analysis/Token.java
===================================================================
--- src/java/org/apache/lucene/analysis/Token.java (revision 684150)
+++ src/java/org/apache/lucene/analysis/Token.java (working copy)
@@ -19,8 +19,9 @@
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.TermPositions; // for javadoc
+import org.apache.lucene.util.ArrayUtil;
-/** A Token is an occurence of a term from the text of a field. It consists of
+/** A Token is an occurrence of a term from the text of a field. It consists of
a term's text, the start and end offset of the term in the text of the field,
and a type string.
@@ -29,7 +30,7 @@
browser, or to show matching text fragments in a KWIC (KeyWord In Context)
display, etc.
- The type is an interned string, assigned by a lexical analyzer
+ The type is a string, assigned by a lexical analyzer
(a.k.a. tokenizer), naming the lexical or syntactic class that the token
belongs to. For example an end of sentence marker token might be implemented
with type "eos". The default token type is "word".
@@ -49,7 +50,7 @@
NOTE: As of 2.3, Token stores the term text
internally as a malleable char[] termBuffer instead of
String termText. The indexing code and core tokenizers
- have been changed re-use a single Token instance, changing
+ have been changed to re-use a single Token instance, changing
its buffer and other fields in-place as the Token is
processed. This provides substantially better indexing
performance as it saves the GC cost of new'ing a Token and
@@ -62,14 +63,55 @@
instance when possible for best performance, by
implementing the {@link TokenStream#next(Token)} API.
Failing that, to create a new Token you should first use
- one of the constructors that starts with null text. Then
- you should call either {@link #termBuffer()} or {@link
- #resizeTermBuffer(int)} to retrieve the Token's
- termBuffer. Fill in the characters of your term into this
- buffer, and finally call {@link #setTermLength(int)} to
+ one of the constructors that starts with null text. To load
+ the token from a char[] use {@link #setTermBuffer(char[], int, int)}.
+ To load from a String use {@link #setTermBuffer(String)} or {@link #setTermBuffer(String, int, int)}.
+ Alternatively you can get the Token's termBuffer by calling either {@link #termBuffer()},
+ if you know that your text is shorter than the capacity of the termBuffer
+ or {@link #resizeTermBuffer(int)}, if there is any possibility
+ that you may need to grow the buffer. Fill in the characters of your term into this
+ buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string,
+ or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setTermLength(int)} to
set the length of the term text. See LUCENE-969
for details. Typical reuse patterns:
+ <ul>
+ <li> Copying text from a string:
+ <pre>
+ return reusableToken.reinit(string, startOffset, endOffset[, type]);
+ </pre>
+ </li>
+ <li> Copying some text from a string:
+ <pre>
+ return reusableToken.reinit(string, 0, string.length(), startOffset, endOffset[, type]);
+ </pre>
+ </li>
+ <li> Copying text from a char[] buffer:
+ <pre>
+ return reusableToken.reinit(buffer, 0, buffer.length, startOffset, endOffset[, type]);
+ </pre>
+ </li>
+ <li> Copying some text from a char[] buffer:
+ <pre>
+ return reusableToken.reinit(buffer, start, end - start, startOffset, endOffset[, type]);
+ </pre>
+ </li>
+ <li> Copying from one Token to another:
+ <pre>
+ return reusableToken.reinit(source.termBuffer(), 0, source.termLength(), source.startOffset(), source.endOffset()[, source.type()]);
+ </pre>
+ </li>
+ </ul>
+ A couple of things to note:
+ <ul>
+ <li> Because TokenStreams can be chained, one cannot assume that the Token's current type is correct.</li>
+ </ul>
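
As a worked instance of the first reuse pattern above, a single-term stream might implement next(Token) like this (sketch only; the done, text, start and end fields are assumed):

    public Token next(Token reusableToken) throws IOException {
      assert reusableToken != null;
      if (done)
        return null;
      done = true;
      // Copy text from a String; offsets are set, type resets to the default.
      return reusableToken.reinit(text, start, end);
    }
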
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (revision 684150)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (working copy)
 * A ShingleFilter constructs shingles (token n-grams) from a token stream.
* In other words, it creates combinations of tokens as a single token.
@@ -299,6 +306,7 @@
private Matrix matrix;
public Token next(Token token) throws IOException {
+ assert token != null;
if (matrix == null) {
matrix = new Matrix();
// fill matrix with maximumShingleSize columns
@@ -340,14 +348,14 @@
}
// shingle token factory
- StringBuilder sb = new StringBuilder(termLength + 10); // paranormal abillity to forsay the future.
+ StringBuilder sb = new StringBuilder(termLength + 10); // paranormal ability to foresee the future.
for (Token shingleToken : shingle) {
if (spacerCharacter != null && sb.length() > 0) {
sb.append(spacerCharacter);
}
sb.append(shingleToken.termBuffer(), 0, shingleToken.termLength());
}
- token.setTermText(sb.toString());
+ token.setTermBuffer(sb.toString());
updateToken(token, shingle, currentPermutationTokensStartOffset, currentPermutationRows, currentPermuationTokens);
return token;
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java (revision 684150)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java (working copy)
@@ -18,8 +18,11 @@
*/
import java.util.Hashtable;
-import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
/**
* Title: ChineseFilter
* Description: Filter with a stop word table
@@ -61,10 +64,11 @@
stopTable.put(STOP_WORDS[i], STOP_WORDS[i]);
}
- public final Token next() throws java.io.IOException {
+ public final Token next(Token token) throws java.io.IOException {
+ assert token != null;
- for (Token token = input.next(); token != null; token = input.next()) {
- String text = token.termText();
+ for (token = input.next(token); token != null; token = input.next(token)) {
+ String text = token.term();
// why not key off token type here assuming ChineseTokenizer comes first?
if (stopTable.get(text) == null) {
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java (revision 684150)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java (working copy)
@@ -19,9 +19,11 @@
import java.io.Reader;
-import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+
/**
* Title: ChineseTokenizer
* Description: Extract tokens from the Stream using Character.getType()
@@ -75,17 +77,19 @@
}
- private final Token flush() {
+ private final Token flush(Token token) {
if (length>0) {
- //System.out.println(new String(buffer, 0, length));
- return new Token(new String(buffer, 0, length), start, start+length);
+ //System.out.println(new String(buffer, 0, length));
+ return token.reinit(buffer, 0, length, start, start+length);
}
else
return null;
}
- public final Token next() throws java.io.IOException {
+ public final Token next(Token token) throws java.io.IOException {
+ assert token != null;
length = 0;
start = offset;
@@ -101,7 +105,7 @@
bufferIndex = 0;
}
- if (dataLen == -1) return flush();
+ if (dataLen == -1) return flush(token);
else
c = ioBuffer[bufferIndex++];
@@ -112,20 +116,20 @@
case Character.LOWERCASE_LETTER:
case Character.UPPERCASE_LETTER:
push(c);
- if (length == MAX_WORD_LEN) return flush();
+ if (length == MAX_WORD_LEN) return flush(token);
break;
case Character.OTHER_LETTER:
if (length>0) {
bufferIndex--;
offset--;
- return flush();
+ return flush(token);
}
push(c);
- return flush();
+ return flush(token);
default:
- if (length>0) return flush();
+ if (length>0) return flush(token);
break;
}
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java (revision 684150)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java (working copy)
@@ -35,25 +35,20 @@
this.charset = charset;
}
- public final Token next() throws java.io.IOException
+ public final Token next(Token token) throws java.io.IOException
{
- Token t = input.next();
+ assert token != null;
+ token = input.next(token);
- if (t == null)
+ if (token == null)
return null;
- String txt = t.termText();
-
- char[] chArray = txt.toCharArray();
- for (int i = 0; i < chArray.length; i++)
+ char[] chArray = token.termBuffer();
+ int chLen = token.termLength();
+ for (int i = 0; i < chLen; i++)
{
chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset);
}
-
- String newTxt = new String(chArray);
- // create new token
- Token newToken = new Token(newTxt, t.startOffset(), t.endOffset());
-
- return newToken;
+ return token;
}
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java (revision 684150)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java (working copy)
@@ -36,7 +36,6 @@
/**
* The actual token in the input stream.
*/
- private Token token = null;
private BrazilianStemmer stemmer = null;
private Set exclusions = null;
@@ -53,22 +52,23 @@
/**
* @return Returns the next token in the stream, or null at EOS.
*/
- public final Token next()
+ public final Token next(Token token)
throws IOException {
- if ((token = input.next()) == null) {
+ assert token != null;
+ if ((token = input.next(token)) == null) {
return null;
}
- // Check the exclusiontable.
- else if (exclusions != null && exclusions.contains(token.termText())) {
- return token;
- } else {
- String s = stemmer.stem(token.termText());
- // If not stemmed, dont waste the time creating a new token.
- if ((s != null) && !s.equals(token.termText())) {
- return new Token(s, token.startOffset(), token.endOffset(), token.type());
- }
- return token;
+
+ String term = token.term();
+
+ // Check the exclusion table.
+ if (exclusions == null || !exclusions.contains(term)) {
+ String s = stemmer.stem(term);
+ // If not stemmed, don't waste the time adjusting the token.
+ if ((s != null) && !s.equals(term))
+ token.setTermBuffer(s);
}
+ return token;
}
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java (revision 684150)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java (working copy)
@@ -28,20 +28,22 @@
public class SingleTokenTokenStream extends TokenStream {
private boolean exhausted = false;
+ // The token needs to be immutable, so work with clones!
private Token token;
public SingleTokenTokenStream(Token token) {
- this.token = token;
+ this.token = (Token) token.clone();
}
public Token next(Token result) throws IOException {
+ assert result != null;
if (exhausted) {
return null;
}
exhausted = true;
- return token;
+ return (Token) token.clone();
}
@@ -50,10 +52,10 @@
}
public Token getToken() {
- return token;
+ return (Token) token.clone();
}
public void setToken(Token token) {
- this.token = token;
+ this.token = (Token) token.clone();
}
}
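
The defensive clones above buy isolation at the cost of one allocation per call: the stream can no longer be mutated through a Token reference held by the caller. A small sketch of the behavior this guarantees (values are illustrative):

    Token prototype = new Token(0, 10);
    prototype.setTermBuffer("fieldvalue");
    SingleTokenTokenStream stream = new SingleTokenTokenStream(prototype);
    prototype.setTermBuffer("mutated");        // does not affect the stream's copy
    Token produced = stream.next(new Token()); // produced.term() is still "fieldvalue"
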
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java (revision 684150)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java (working copy)
@@ -46,6 +46,7 @@
private boolean prefixExhausted;
public Token next(Token result) throws IOException {
+ assert result != null;
Token buf = result;
@@ -124,7 +125,6 @@
if (source.termBuffer() != null) {
setTermBuffer(source.termBuffer(), 0, source.termLength());
} else {
- setTermText(null);
setTermLength(0);
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java (revision 684150)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java (working copy)
@@ -27,18 +27,8 @@
*/
public class EmptyTokenStream extends TokenStream {
- public Token next() throws IOException {
- return null;
- }
-
public Token next(Token result) throws IOException {
+ assert result != null;
return null;
}
-
- public void reset() throws IOException {
- }
-
- public void close() throws IOException {
- }
-
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java (revision 684150)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java (working copy)
@@ -56,6 +56,7 @@
public Token next(Token result) throws IOException {
+ assert result != null;
return suffix.next(result);
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (revision 684150)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (working copy)
@@ -64,7 +64,8 @@
}
/** Returns the next token in the stream, or null at EOS. */
- public final Token next() throws IOException {
+ public final Token next(Token token) throws IOException {
+ assert token != null;
if (!started) {
started = true;
gramSize = minGram;
@@ -82,9 +83,9 @@
if (pos+gramSize > inLen)
return null;
}
- String gram = inStr.substring(pos, pos+gramSize);
+
int oldPos = pos;
pos++;
- return new Token(gram, oldPos, oldPos+gramSize);
+ return token.reinit(inStr, oldPos, gramSize, oldPos, oldPos+gramSize);
}
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java (revision 684150)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java (working copy)
@@ -115,15 +115,15 @@
}
/** Returns the next token in the stream, or null at EOS. */
- public final Token next() throws IOException {
+ public final Token next(Token token) throws IOException {
+ assert token != null;
if (ngrams.size() > 0) {
return (Token) ngrams.removeFirst();
}
- Token token = input.next();
- if (token == null) {
+ token = input.next(token);
+ if (token == null)
return null;
- }
ngram(token);
if (ngrams.size() > 0)
@@ -133,12 +133,12 @@
}
private void ngram(Token token) {
- String inStr = token.termText();
- int inLen = inStr.length();
+ int termLength = token.termLength();
+ char[] termBuffer = token.termBuffer();
int gramSize = minGram;
while (gramSize <= maxGram) {
// if the remaining input is too short, we can't generate any n-grams
- if (gramSize > inLen) {
+ if (gramSize > termLength) {
return;
}
@@ -147,13 +147,13 @@
return;
}
- Token tok;
- if (side == Side.FRONT) {
- tok = new Token(inStr.substring(0, gramSize), 0, gramSize);
- }
- else {
- tok = new Token(inStr.substring(inLen-gramSize), inLen-gramSize, inLen);
- }
+ // grab gramSize chars from front or back
+ int start = side == Side.FRONT ? 0 : termLength - gramSize;
+ int end = start + gramSize;
+ Token tok = (Token) token.clone();
+ tok.setStartOffset(start);
+ tok.setEndOffset(end);
+ tok.setTermBuffer(termBuffer, start, gramSize);
ngrams.add(tok);
gramSize++;
}
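
For reference, the rewritten filter still emits the same edge n-grams as before; e.g. with FRONT and minGram=1, maxGram=3 (usage sketch; the constructor shape is assumed, not confirmed by this patch):

    TokenStream ts = new EdgeNGramTokenFilter(
        new WhitespaceTokenizer(new StringReader("apache")),
        EdgeNGramTokenFilter.Side.FRONT, 1, 3);
    final Token reusableToken = new Token();
    for (Token t = ts.next(reusableToken); t != null; t = ts.next(reusableToken)) {
      System.out.println(t.term()); // prints "a", "ap", "apa"
    }
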
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (revision 684150)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (working copy)
@@ -19,6 +19,7 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter.Side;
import java.io.IOException;
import java.io.Reader;
@@ -113,13 +114,14 @@
}
/** Returns the next token in the stream, or null at EOS. */
- public final Token next() throws IOException {
+ public final Token next(Token token) throws IOException {
+ assert token != null;
// if we are just starting, read the whole input
if (!started) {
started = true;
char[] chars = new char[1024];
input.read(chars);
- inStr = new String(chars).trim(); // remove any trailing empty strings
+ inStr = new String(chars).trim(); // remove any leading or trailing spaces
inLen = inStr.length();
gramSize = minGram;
}
@@ -134,15 +136,13 @@
return null;
}
- Token tok;
- if (side == Side.FRONT) {
- tok = new Token(inStr.substring(0, gramSize), 0, gramSize);
- }
- else {
- tok = new Token(inStr.substring(inLen-gramSize), inLen-gramSize, inLen);
- }
-
+ // grab gramSize chars from front or back
+ int start = side == Side.FRONT ? 0 : inLen - gramSize;
+ int end = start + gramSize;
+ token.setTermBuffer(inStr, start, gramSize);
+ token.setStartOffset(start);
+ token.setEndOffset(end);
gramSize++;
- return tok;
+ return token;
}
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (revision 684150)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (working copy)
@@ -63,12 +63,13 @@
}
/** Returns the next token in the stream, or null at EOS. */
- public final Token next() throws IOException {
+ public final Token next(Token token) throws IOException {
+ assert token != null;
if (ngrams.size() > 0) {
return (Token) ngrams.removeFirst();
}
- Token token = input.next();
+ token = input.next(token);
if (token == null) {
return null;
}
@@ -81,16 +82,13 @@
}
private void ngram(Token token) {
- String inStr = token.termText();
- int inLen = inStr.length();
+ char[] termBuffer = token.termBuffer();
+ int termLength = token.termLength();
int gramSize = minGram;
while (gramSize <= maxGram) {
int pos = 0; // reset to beginning of string
- while (pos+gramSize <= inLen) { // while there is input
- String gram = inStr.substring(pos, pos+gramSize);
- Token tok = new Token(gram, pos, pos+gramSize);
-// tok.setPositionIncrement(pos);
- ngrams.add(tok);
+ while (pos+gramSize <= termLength) { // while there is input
+ ngrams.add(token.clone(termBuffer, pos, gramSize, pos, pos+gramSize));
pos++;
}
gramSize++; // increase n-gram size
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (revision 684150)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (working copy)
@@ -26,7 +26,7 @@
/**
* CJKTokenizer was modified from StopTokenizer which does a decent job for
* most European languages. It performs other token methods for double-byte
- * Characters: the token will return at each two charactors with overlap match.
+ * Characters: the token will return at each two characters with overlap match.
 * Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it
 * also need filter filter zero length token ""
 * for Digit: digit, '+', '#' will token as letter
Index: src/java/org/apache/lucene/index/Payload.java
===================================================================
--- src/java/org/apache/lucene/index/Payload.java (revision 684150)
+++ src/java/org/apache/lucene/index/Payload.java (working copy)
- * To store payloads in the index a {@link TokenStream} has to be used that
- * produces {@link Token}s containing payload data.
- *
- * Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)}
- * to retrieve the payloads from the index.
- *
- */
-public class Payload implements Serializable, Cloneable {
- /** the byte array containing the payload data */
- protected byte[] data;
+/**
+ * A Payload is metadata that can be stored together with each occurrence
+ * of a term. This metadata is stored inline in the posting list of the
+ * specific term.
+ *
+ * To store payloads in the index a {@link TokenStream} has to be used that
+ * produces {@link Token}s containing payload data.
+ *
+ * Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)}
+ * to retrieve the payloads from the index.
+ *
+ */
+public class Payload implements Serializable, Cloneable {
+ /** the byte array containing the payload data */
+ protected byte[] data;
- /** the offset within the byte array */
- protected int offset;
+ /** the offset within the byte array */
+ protected int offset;
- /** the length of the payload data */
- protected int length;
+ /** the length of the payload data */
+ protected int length;
- /** Creates an empty payload and does not allocate a byte array. */
- public Payload() {
- // nothing to do
- }
+ /** Creates an empty payload and does not allocate a byte array. */
+ public Payload() {
+ // nothing to do
+ }
- /**
- * Creates a new payload with the the given array as data.
- * A reference to the passed-in array is held, i. e. no
- * copy is made.
- *
- * @param data the data of this payload
- */
- public Payload(byte[] data) {
- this(data, 0, data.length);
- }
+ /**
+ * Creates a new payload with the the given array as data.
+ * A reference to the passed-in array is held, i. e. no
+ * copy is made.
+ *
+ * @param data the data of this payload
+ */
+ public Payload(byte[] data) {
+ this(data, 0, data.length);
+ }
- /**
- * Creates a new payload with the the given array as data.
- * A reference to the passed-in array is held, i. e. no
- * copy is made.
- *
- * @param data the data of this payload
- * @param offset the offset in the data byte array
- * @param length the length of the data
- */
- public Payload(byte[] data, int offset, int length) {
- if (offset < 0 || offset + length > data.length) {
- throw new IllegalArgumentException();
- }
- this.data = data;
- this.offset = offset;
- this.length = length;
+ /**
+ * Creates a new payload with the the given array as data.
+ * A reference to the passed-in array is held, i. e. no
+ * copy is made.
+ *
+ * @param data the data of this payload
+ * @param offset the offset in the data byte array
+ * @param length the length of the data
+ */
+ public Payload(byte[] data, int offset, int length) {
+ if (offset < 0 || offset + length > data.length) {
+ throw new IllegalArgumentException();
+ }
+ this.data = data;
+ this.offset = offset;
+ this.length = length;
+ }
- /**
- * Sets this payloads data.
- * A reference to the passed-in array is held, i. e. no
- * copy is made.
- */
- public void setData(byte[] data) {
- setData(data, 0, data.length);
- }
+ /**
+ * Sets this payloads data.
+ * A reference to the passed-in array is held, i. e. no
+ * copy is made.
+ */
+ public void setData(byte[] data) {
+ setData(data, 0, data.length);
+ }
- /**
- * Sets this payloads data.
- * A reference to the passed-in array is held, i. e. no
- * copy is made.
- */
- public void setData(byte[] data, int offset, int length) {
- this.data = data;
- this.offset = offset;
- this.length = length;
- }
+ /**
+ * Sets this payloads data.
+ * A reference to the passed-in array is held, i. e. no
+ * copy is made.
+ */
+ public void setData(byte[] data, int offset, int length) {
+ this.data = data;
+ this.offset = offset;
+ this.length = length;
+ }
- /**
- * Returns a reference to the underlying byte array
- * that holds this payloads data.
- */
- public byte[] getData() {
- return this.data;
- }
+ /**
+ * Returns a reference to the underlying byte array
+ * that holds this payloads data.
+ */
+ public byte[] getData() {
+ return this.data;
+ }
- /**
- * Returns the offset in the underlying byte array
- */
- public int getOffset() {
- return this.offset;
- }
+ /**
+ * Returns the offset in the underlying byte array
+ */
+ public int getOffset() {
+ return this.offset;
+ }
- /**
- * Returns the length of the payload data.
- */
- public int length() {
- return this.length;
- }
+ /**
+ * Returns the length of the payload data.
+ */
+ public int length() {
+ return this.length;
+ }
- /**
- * Returns the byte at the given index.
- */
- public byte byteAt(int index) {
- if (0 <= index && index < this.length) {
- return this.data[this.offset + index];
- }
- throw new ArrayIndexOutOfBoundsException(index);
+ /**
+ * Returns the byte at the given index.
+ */
+ public byte byteAt(int index) {
+ if (0 <= index && index < this.length) {
+ return this.data[this.offset + index];
}
+ throw new ArrayIndexOutOfBoundsException(index);
+ }
- /**
- * Allocates a new byte array, copies the payload data into it and returns it.
- */
- public byte[] toByteArray() {
- byte[] retArray = new byte[this.length];
- System.arraycopy(this.data, this.offset, retArray, 0, this.length);
- return retArray;
- }
+ /**
+ * Allocates a new byte array, copies the payload data into it and returns it.
+ */
+ public byte[] toByteArray() {
+ byte[] retArray = new byte[this.length];
+ System.arraycopy(this.data, this.offset, retArray, 0, this.length);
+ return retArray;
+ }
- /**
- * Copies the payload data to a byte array.
- *
- * @param target the target byte array
- * @param targetOffset the offset in the target byte array
- */
- public void copyTo(byte[] target, int targetOffset) {
- if (this.length > target.length + targetOffset) {
- throw new ArrayIndexOutOfBoundsException();
- }
- System.arraycopy(this.data, this.offset, target, targetOffset, this.length);
+ /**
+ * Copies the payload data to a byte array.
+ *
+ * @param target the target byte array
+ * @param targetOffset the offset in the target byte array
+ */
+ public void copyTo(byte[] target, int targetOffset) {
+ if (this.length > target.length + targetOffset) {
+ throw new ArrayIndexOutOfBoundsException();
}
+ System.arraycopy(this.data, this.offset, target, targetOffset, this.length);
+ }
- /**
- * Clones this payload by creating a copy of the underlying
- * byte array.
- */
- public Object clone() {
- Payload clone = new Payload(this.toByteArray());
- return clone;
- }
+ /**
+ * Clones this payload by creating a copy of the underlying
+ * byte array.
+ */
+ public Object clone() {
+ Payload clone = new Payload(this.toByteArray());
+ return clone;
+ }
+
+ public boolean equals(Object obj) {
+ if (obj == this)
+ return true;
+ if (obj instanceof Payload) {
+ Payload other = (Payload) obj;
+ if (length == other.length) {
+ for(int i=0;i<length;i++)
+ if (data[offset+i] != other.data[other.offset+i])
+ return false;
+ return true;
+ } else
+ return false;
+ } else
+ return false;
+ }
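
(Aside, not part of the patch: a quick sanity sketch of the aliasing contract documented above. The constructors and setData keep a reference to the caller's array, while toByteArray() and clone() copy. The class name is illustrative; run with java -ea.)

import org.apache.lucene.index.Payload;

public class PayloadContractSketch {
  public static void main(String[] args) {
    byte[] raw = { 10, 20, 30, 40 };

    // The (byte[], int, int) constructor keeps a reference to 'raw'; no copy is made.
    Payload p = new Payload(raw, 1, 2);
    assert p.length() == 2;
    assert p.byteAt(0) == 20 && p.byteAt(1) == 30;

    // toByteArray() allocates a fresh array holding only the two-byte slice.
    byte[] copy = p.toByteArray();
    assert copy.length == 2 && copy[0] == 20 && copy[1] == 30;

    // Because construction made no copy, later writes to 'raw' show through.
    raw[1] = 99;
    assert p.byteAt(0) == 99;

    // clone() does copy the underlying bytes, so it is insulated from 'raw'.
    Payload q = (Payload) p.clone();
    raw[1] = 20;
    assert q.byteAt(0) == 99 && p.byteAt(0) == 20;
  }
}
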
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (revision 684150)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (working copy)
- * Characters: the token will return at each two charactors with overlap match.
+ * Characters: the token will return at each two characters with overlap match.
* Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it
* also need filter filter zero length token ""
* for Digit: digit, '+', '#' will token as letter
@@ -96,24 +96,26 @@
* See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
* for detail.
*
+ * @param token a reusable token
* @return Token
*
* @throws java.io.IOException - throw IOException when read error
- * hanppened in the InputStream
+ * happened in the InputStream
*
*/
- public final Token next() throws java.io.IOException {
+ public final Token next(Token token) throws java.io.IOException {
/** how many character(s) has been stored in buffer */
+ assert token != null;
int length = 0;
/** the position used to create Token */
int start = offset;
while (true) {
- /** current charactor */
+ /** current character */
char c;
- /** unicode block of current charactor for detail */
+ /** unicode block of current character for detail */
Character.UnicodeBlock ub;
offset++;
@@ -198,7 +200,7 @@
}
}
} else {
- // non-ASCII letter, eg."C1C2C3C4"
+ // non-ASCII letter, e.g."C1C2C3C4"
if (Character.isLetter(c)) {
if (length == 0) {
start = offset - 1;
@@ -236,8 +238,6 @@
}
}
- return new Token(new String(buffer, 0, length), start, start + length,
- tokenType
- );
+ return token.reinit(buffer, 0, length, start, start+length, tokenType);
}
}
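
(Aside, not part of the patch: the consumer side of the next(Token) convention the hunk above migrates to. This is the same loop shape the patch introduces in MoreLikeThis and SimilarityQueries below; the sample text and class name are illustrative.)

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKTokenizer;

public class ReusableNextSketch {
  public static void main(String[] args) throws IOException {
    TokenStream ts = new CJKTokenizer(new StringReader("java \u4e00\u4e8c\u4e09\u56db"));

    // Allocate one Token up front; next(Token) may reuse it or hand back a
    // different instance, so always continue with the returned reference.
    final Token reusableToken = new Token();
    for (Token t = ts.next(reusableToken); t != null; t = ts.next(reusableToken)) {
      // Per the javadoc above, this prints "java" followed by the
      // overlapping two-character tokens of the CJK run.
      System.out.println(t.term() + " [" + t.startOffset() + "," + t.endOffset() + ")");
    }
  }
}
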
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java (revision 684150)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java (working copy)
@@ -37,12 +37,11 @@
/**
* The actual token in the input stream.
*/
- private Token token = null;
private FrenchStemmer stemmer = null;
private Set exclusions = null;
public FrenchStemFilter( TokenStream in ) {
- super(in);
+ super(in);
stemmer = new FrenchStemmer();
}
@@ -55,23 +54,22 @@
/**
* @return Returns the next token in the stream, or null at EOS
*/
- public final Token next()
+ public final Token next(Token token)
throws IOException {
- if ( ( token = input.next() ) == null ) {
+ assert token != null;
+ if ( ( token = input.next(token) ) == null ) {
return null;
}
- // Check the exclusiontable
- else if ( exclusions != null && exclusions.contains( token.termText() ) ) {
- return token;
+ String term = token.term();
+
+ // Check the exclusion table
+ if ( exclusions == null || !exclusions.contains( term ) ) {
+ String s = stemmer.stem( term );
+ // If not stemmed, don't waste the time adjusting the token.
+ if ((s != null) && !s.equals( term ) )
+ token.setTermBuffer(s);
}
- else {
- String s = stemmer.stem( token.termText() );
- // If not stemmed, dont waste the time creating a new token
- if ( !s.equals( token.termText() ) ) {
- return new Token( s, token.startOffset(), token.endOffset(), token.type());
- }
- return token;
- }
+ return token;
}
/**
* Set a alternative/custom FrenchStemmer for this filter.
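
(Aside, not part of the patch: the FrenchStemFilter hunk above is one instance of the rewrite-in-place pattern this patch applies to every stemming filter, including DutchStemFilter below. The sketch distills that pattern, with a naive trailing-'s' stripper standing in for the real stemmer; the filter name and the rule are hypothetical.)

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

public class InPlaceRewriteSketch extends TokenFilter {
  public InPlaceRewriteSketch(TokenStream in) {
    super(in);
  }

  public final Token next(Token token) throws IOException {
    assert token != null;
    if ((token = input.next(token)) == null)
      return null;
    String term = token.term();
    // Stand-in for stemmer.stem(term).
    String s = term.endsWith("s") ? term.substring(0, term.length() - 1) : term;
    // Rewrite in place only when the term changed; offsets, type, flags and
    // payload survive untouched because the same Token instance flows through.
    if (!s.equals(term))
      token.setTermBuffer(s);
    return token;
  }

  public static void main(String[] args) throws IOException {
    TokenStream ts = new InPlaceRewriteSketch(
        new WhitespaceTokenizer(new StringReader("chats chat")));
    final Token reusableToken = new Token();
    for (Token t = ts.next(reusableToken); t != null; t = ts.next(reusableToken))
      System.out.println(t.term()); // prints "chat" twice
  }
}
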
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (revision 684150)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (working copy)
@@ -38,7 +38,7 @@
public class ElisionFilter extends TokenFilter {
private Set articles = null;
- private static String apostrophes = "'’";
+ private static char[] apostrophes = {'\'', '’'};
public void setArticles(Set articles) {
this.articles = new HashSet();
@@ -74,25 +74,36 @@
}
/**
- * Returns the next input Token whith termText() without elisioned start
+ * Returns the next input Token with term() without elisioned start
*/
- public Token next() throws IOException {
- Token t = input.next();
- if (t == null)
+ public Token next(Token token) throws IOException {
+ assert token != null;
+ token = input.next(token);
+ if (token == null)
return null;
- String text = t.termText();
- System.out.println(text);
- int minPoz = -1;
- int poz;
- for (int i = 0; i < apostrophes.length(); i++) {
- poz = text.indexOf(apostrophes.charAt(i));
- if (poz != -1)
- minPoz = (minPoz == -1) ? poz : Math.min(poz, minPoz);
+
+ char[] termBuffer = token.termBuffer();
+ int termLength = token.termLength();
+
+ int minPoz = Integer.MAX_VALUE;
+ for (int i = 0; i < apostrophes.length; i++) {
+ char apos = apostrophes[i];
+ // The equivalent of String.indexOf(ch)
+ for (int poz = 0; poz < termLength ; poz++) {
+ if (termBuffer[poz] == apos) {
+ minPoz = Math.min(poz, minPoz);
+ break;
+ }
+ }
}
- if (minPoz != -1
- && articles.contains(text.substring(0, minPoz).toLowerCase()))
- text = text.substring(minPoz + 1);
- return new Token(text, t.startOffset(), t.endOffset(), t.type());
+
+ // An apostrophe has been found. If the prefix is an article, strip it off.
+ if (minPoz != Integer.MAX_VALUE
+ && articles.contains(new String(token.termBuffer(), 0, minPoz).toLowerCase())) {
+ token.setTermBuffer(token.termBuffer(), minPoz + 1, token.termLength() - (minPoz + 1));
+ }
+
+ return token;
}
}
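
(Aside, not part of the patch: a usage sketch for the rewritten ElisionFilter, assuming the single-argument constructor and its default article set, which includes l'. WhitespaceTokenizer is used so that l'avion reaches the filter as a single token.)

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.fr.ElisionFilter;

public class ElisionSketch {
  public static void main(String[] args) throws IOException {
    TokenStream ts = new ElisionFilter(
        new WhitespaceTokenizer(new StringReader("l'avion")));
    final Token reusableToken = new Token();
    for (Token t = ts.next(reusableToken); t != null; t = ts.next(reusableToken)) {
      // The article prefix and the apostrophe are stripped in place,
      // so this prints "avion"; the token's offsets are left unchanged.
      System.out.println(t.term());
    }
  }
}
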
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java (revision 684150)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java (working copy)
@@ -38,7 +38,6 @@
/**
* The actual token in the input stream.
*/
- private Token token = null;
private DutchStemmer stemmer = null;
private Set exclusions = null;
@@ -48,7 +47,7 @@
}
/**
- * Builds a DutchStemFilter that uses an exclusiontable.
+ * Builds a DutchStemFilter that uses an exclusion table.
*/
public DutchStemFilter(TokenStream _in, Set exclusiontable) {
this(_in);
@@ -66,23 +65,21 @@
/**
* @return Returns the next token in the stream, or null at EOS
*/
- public Token next() throws IOException {
- if ((token = input.next()) == null) {
+ public Token next(Token token) throws IOException {
+ assert token != null;
+ if ((token = input.next(token)) == null) {
return null;
}
+ String term = token.term();
- // Check the exclusiontable
- else if (exclusions != null && exclusions.contains(token.termText())) {
- return token;
- } else {
- String s = stemmer.stem(token.termText());
- // If not stemmed, dont waste the time creating a new token
- if (!s.equals(token.termText())) {
- return new Token(s, token.startOffset(),
- token.endOffset(), token.type());
- }
- return token;
+ // Check the exclusion table.
+ if (exclusions == null || !exclusions.contains(term)) {
+ String s = stemmer.stem(term);
+ // If not stemmed, don't waste the time adjusting the token.
+ if ((s != null) && !s.equals(term))
+ token.setTermBuffer(s);
}
+ return token;
}
/**
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (revision 684150)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (working copy)
@@ -40,31 +40,41 @@
breaker = BreakIterator.getWordInstance(new Locale("th"));
}
- public Token next() throws IOException {
+ public Token next(Token token) throws IOException {
+ assert token != null;
if (thaiToken != null) {
- String text = thaiToken.termText();
int start = breaker.current();
int end = breaker.next();
if (end != BreakIterator.DONE) {
- return new Token(text.substring(start, end),
- thaiToken.startOffset()+start, thaiToken.startOffset()+end, thaiToken.type());
+ token.setTermBuffer(thaiToken.termBuffer(), start, end - start);
+ token.setStartOffset(thaiToken.startOffset()+start);
+ token.setEndOffset(thaiToken.startOffset()+end);
+ token.setType(thaiToken.type());
+ token.setPayload(thaiToken.getPayload());
+ token.setFlags(thaiToken.getFlags());
+ return token;
}
thaiToken = null;
}
- Token tk = input.next();
- if (tk == null) {
+
+ token = input.next(token);
+ if (token == null || token.termLength() == 0) {
return null;
}
- String text = tk.termText();
+
+ String text = token.term();
if (UnicodeBlock.of(text.charAt(0)) != UnicodeBlock.THAI) {
- return new Token(text.toLowerCase(), tk.startOffset(), tk.endOffset(), tk.type());
+ token.setTermBuffer(text.toLowerCase());
+ return token;
}
- thaiToken = tk;
+
+ thaiToken = (Token) token.clone();
breaker.setText(text);
int end = breaker.next();
if (end != BreakIterator.DONE) {
- return new Token(text.substring(0, end),
- thaiToken.startOffset(), thaiToken.startOffset()+end, thaiToken.type());
+ token.setTermBuffer(text, 0, end);
+ token.setEndOffset(token.startOffset() + end);
+ return token;
}
return null;
}
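
(Aside, not part of the patch: ThaiWordFilter is the one filter here that cannot be fully allocation-free, since it must hold the current token across calls while BreakIterator slices it, hence the private clone. The hypothetical filter below isolates that save-and-slice structure, using fixed two-character chunks in place of a BreakIterator.)

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

public class ChunkingSketchFilter extends TokenFilter {
  private Token saved; // independent copy of the token currently being sliced
  private int pos;     // start of the next slice within 'saved'

  public ChunkingSketchFilter(TokenStream in) {
    super(in);
  }

  public Token next(Token token) throws IOException {
    assert token != null;
    if (saved == null) {
      token = input.next(token);
      if (token == null)
        return null;
      // Clone before slicing: the caller is free to reuse 'token' on the
      // next call, so state carried across calls must be an independent copy.
      saved = (Token) token.clone();
      pos = 0;
    }
    int end = Math.min(pos + 2, saved.termLength());
    token.setTermBuffer(saved.termBuffer(), pos, end - pos);
    token.setStartOffset(saved.startOffset() + pos);
    token.setEndOffset(saved.startOffset() + end);
    token.setType(saved.type());
    pos = end;
    if (pos >= saved.termLength())
      saved = null; // done slicing this token
    return token;
  }

  public static void main(String[] args) throws IOException {
    TokenStream ts = new ChunkingSketchFilter(
        new WhitespaceTokenizer(new StringReader("abcde")));
    final Token reusableToken = new Token();
    for (Token t = ts.next(reusableToken); t != null; t = ts.next(reusableToken))
      System.out.println(t.term() + " [" + t.startOffset() + "," + t.endOffset() + ")");
    // prints: ab [0,2)  cd [2,4)  e [4,5)
  }
}
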
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java (revision 684150)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java (working copy)
@@ -42,6 +42,7 @@
}
public Token next(Token result) throws IOException {
+ assert result != null;
result = input.next(result);
if (result != null && result.type().equals(typeMatch)){
result.setPayload(thePayload);
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java (revision 684150)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java (working copy)
@@ -40,6 +40,7 @@
public Token next(Token result) throws IOException {
+ assert result != null;
result = input.next(result);
if (result != null && result.type() != null && result.type().equals("") == false){
result.setPayload(new Payload(result.type().getBytes("UTF-8")));
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java (revision 684150)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java (working copy)
@@ -39,6 +39,7 @@
}
public Token next(Token result) throws IOException {
+ assert result != null;
result = input.next(result);
if (result != null){
byte[] data = new byte[8];
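
(Aside, not part of the patch: TokenOffsetPayloadTokenFilter packs the start and end offset into an 8-byte payload, four big-endian bytes each, which is what the remainder of this hunk writes into 'data'. A matching encode/decode sketch, assuming that layout; class and method names are illustrative.)

import org.apache.lucene.index.Payload;

public class OffsetPayloadSketch {
  /** Write 'v' as four big-endian bytes into target[at..at+3]. */
  static void encodeInt(int v, byte[] target, int at) {
    target[at]     = (byte) (v >>> 24);
    target[at + 1] = (byte) (v >>> 16);
    target[at + 2] = (byte) (v >>> 8);
    target[at + 3] = (byte) v;
  }

  /** Reassemble a big-endian int from four payload bytes starting at 'from'. */
  static int decodeInt(Payload p, int from) {
    return ((p.byteAt(from) & 0xFF) << 24)
         | ((p.byteAt(from + 1) & 0xFF) << 16)
         | ((p.byteAt(from + 2) & 0xFF) << 8)
         |  (p.byteAt(from + 3) & 0xFF);
  }

  public static void main(String[] args) {
    // Simulate the filter's encoding for a token at offsets [17, 25).
    byte[] data = new byte[8];
    encodeInt(17, data, 0);
    encodeInt(25, data, 4);
    Payload payload = new Payload(data);
    System.out.println(decodeInt(payload, 0) + ".." + decodeInt(payload, 4)); // 17..25
  }
}
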
Index: contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java
===================================================================
--- contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java (revision 684150)
+++ contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java (working copy)
@@ -28,6 +28,7 @@
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Hits;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
@@ -808,10 +809,11 @@
throws IOException
{
TokenStream ts = analyzer.tokenStream(fieldName, r);
- org.apache.lucene.analysis.Token token;
int tokenCount=0;
- while ((token = ts.next()) != null) { // for every token
- String word = token.termText();
+ // for every token
+ final Token reusableToken = new Token();
+ for (Token token = ts.next(reusableToken); token != null; token = ts.next(reusableToken)) {
+ String word = token.term();
tokenCount++;
if(tokenCount>maxNumTokensParsed)
{
@@ -872,7 +874,7 @@
* For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
*
* @param r the reader that has the content of the document
- * @return the most intresting words in the document ordered by score, with the highest scoring, or best entry, first
+ * @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
*
* @see #retrieveInterestingTerms
*/
Index: contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java
===================================================================
--- contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java (revision 684150)
+++ contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java (working copy)
@@ -21,6 +21,7 @@
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
@@ -85,12 +86,11 @@
throws IOException
{
TokenStream ts = a.tokenStream( field, new StringReader( body));
- org.apache.lucene.analysis.Token t;
BooleanQuery tmp = new BooleanQuery();
Set already = new HashSet(); // ignore dups
- while ( (t = ts.next()) != null)
- {
- String word = t.termText();
+ final Token reusableToken = new Token();
+ for (Token token = ts.next(reusableToken); token != null; token = ts.next(reusableToken)) {
+ String word = token.term();
// ignore opt stop words
if ( stop != null &&
stop.contains( word)) continue;
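
(Aside, not part of the patch: the loop above is the canonical consumption idiom after this change. The returned Token is only valid until the following next(reusableToken) call, so anything kept beyond one iteration must be copied out, as this sketch notes.)

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;

public class ReusableTokenPitfallSketch {
  public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceAnalyzer()
        .tokenStream("body", new StringReader("foo bar baz"));

    final Token reusableToken = new Token();
    List terms = new ArrayList();
    for (Token token = ts.next(reusableToken); token != null; token = ts.next(reusableToken)) {
      // Safe: term() materializes a new String, so the value survives later
      // calls. Keeping 'token' itself would not be safe, because the stream
      // may overwrite it on the next next(reusableToken) call.
      terms.add(token.term());
    }
    System.out.println(terms); // [foo, bar, baz]
  }
}
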
Index: contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java
===================================================================
--- contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java (revision 684150)
+++ contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java (working copy)
@@ -104,18 +104,20 @@
{
if(f.queryString==null) return;
TokenStream ts=analyzer.tokenStream(f.fieldName,new StringReader(f.queryString));
- Token token=ts.next();
+ final Token reusableToken = new Token();
+ Token token = ts.next(reusableToken);
int corpusNumDocs=reader.numDocs();
Term internSavingTemplateTerm =new Term(f.fieldName,""); //optimization to avoid constructing new Term() objects
HashSet processedTerms=new HashSet();
while(token!=null)
- {
- if(!processedTerms.contains(token.termText()))
+ {
+ String term = token.term();
+ if(!processedTerms.contains(term))
{
- processedTerms.add(token.termText());
+ processedTerms.add(term);
ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
float minScore=0;
- Term startTerm=internSavingTemplateTerm.createTerm(token.termText());
+ Term startTerm=internSavingTemplateTerm.createTerm(term);
FuzzyTermEnum fe=new FuzzyTermEnum(reader,startTerm,f.minSimilarity,f.prefixLength);
TermEnum origEnum = reader.terms(startTerm);
int df=0;
@@ -162,8 +164,8 @@
q.insert(st);
}
}
- token=ts.next();
- }
+ token=ts.next(reusableToken);
+ }
}
public Query rewrite(IndexReader reader) throws IOException