Index: src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java
===================================================================
--- src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java (revision 683439)
+++ src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java (working copy)
@@ -141,34 +141,38 @@
private final class TestFilter extends TokenFilter {
- private org.apache.lucene.analysis.Token prevToken;
+ private Token prevToken;
public TestFilter(TokenStream in) {
super(in);
}
- public final org.apache.lucene.analysis.Token next() throws java.io.IOException {
+ public final Token next(Token token) throws java.io.IOException {
if (multiToken > 0) {
- org.apache.lucene.analysis.Token token =
- new org.apache.lucene.analysis.Token("multi"+(multiToken+1), prevToken.startOffset(),
- prevToken.endOffset(), prevToken.type());
+ token.clear();
+ token.setTermBuffer("multi"+(multiToken+1));
+ token.setStartOffset(prevToken.startOffset());
+ token.setEndOffset(prevToken.endOffset());
+ token.setType(prevToken.type());
token.setPositionIncrement(0);
multiToken--;
return token;
} else {
- org.apache.lucene.analysis.Token t = input.next();
- prevToken = t;
- if (t == null)
+ token = input.next(token);
+ if (token == null) {
+ prevToken = null;
return null;
- String text = t.termText();
+ }
+ prevToken = (Token) token.clone();
+ String text = token.term();
if (text.equals("triplemulti")) {
multiToken = 2;
- return t;
+ return token;
} else if (text.equals("multi")) {
multiToken = 1;
- return t;
+ return token;
} else {
- return t;
+ return token;
}
}
}
@@ -197,20 +201,14 @@
super(in);
}
- public final org.apache.lucene.analysis.Token next() throws java.io.IOException {
- for (Token t = input.next(); t != null; t = input.next()) {
- if (t.termText().equals("the")) {
+ public final Token next(Token token) throws java.io.IOException {
+ for (token = input.next(token); token != null; token = input.next(token)) {
+ if (token.term().equals("the")) {
// stopword, do nothing
- } else if (t.termText().equals("quick")) {
- org.apache.lucene.analysis.Token token =
- new org.apache.lucene.analysis.Token(t.termText(), t.startOffset(),
- t.endOffset(), t.type());
+ } else if (token.term().equals("quick")) {
token.setPositionIncrement(2);
return token;
} else {
- org.apache.lucene.analysis.Token token =
- new org.apache.lucene.analysis.Token(t.termText(), t.startOffset(),
- t.endOffset(), t.type());
token.setPositionIncrement(1);
return token;
}
Index: src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java
===================================================================
--- src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java (revision 683439)
+++ src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java (working copy)
@@ -319,7 +319,7 @@
}
private static class EmptyTokenStream extends TokenStream {
- public Token next() {
+ public Token next(Token token) {
return null;
}
}
Index: src/test/org/apache/lucene/queryParser/TestQueryParser.java
===================================================================
--- src/test/org/apache/lucene/queryParser/TestQueryParser.java (revision 683439)
+++ src/test/org/apache/lucene/queryParser/TestQueryParser.java (working copy)
@@ -75,19 +75,29 @@
boolean inPhrase = false;
int savedStart = 0, savedEnd = 0;
- public Token next() throws IOException {
+ public Token next(Token token) throws IOException {
if (inPhrase) {
inPhrase = false;
- return new Token("phrase2", savedStart, savedEnd);
+ token.clear();
+ token.setTermBuffer("phrase2");
+ token.setStartOffset(savedStart);
+ token.setEndOffset(savedEnd);
+ token.setType(Token.DEFAULT_TYPE);
+ return token;
} else
- for (Token token = input.next(); token != null; token = input.next()) {
- if (token.termText().equals("phrase")) {
+ for (token = input.next(token); token != null; token = input.next(token)) {
+ if (token.term().equals("phrase")) {
inPhrase = true;
savedStart = token.startOffset();
savedEnd = token.endOffset();
- return new Token("phrase1", savedStart, savedEnd);
- } else if (!token.termText().equals("stop"))
+ token.clear();
+ token.setTermBuffer("phrase1");
+ token.setStartOffset(savedStart);
+ token.setEndOffset(savedEnd);
+ token.setType(Token.DEFAULT_TYPE);
return token;
+ } else if (!token.term().equals("stop"))
+ return token;
}
return null;
}
Index: src/test/org/apache/lucene/analysis/TestToken.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestToken.java (revision 683439)
+++ src/test/org/apache/lucene/analysis/TestToken.java (working copy)
@@ -17,7 +17,6 @@
* limitations under the License.
*/
-import java.io.*;
import org.apache.lucene.util.LuceneTestCase;
public class TestToken extends LuceneTestCase {
@@ -26,6 +25,119 @@
super(name);
}
+ public void testCtor() throws Exception {
+ Token t = new Token();
+ char[] content = "hello".toCharArray();
+ t.setTermBuffer(content, 0, content.length);
+ char[] buf = t.termBuffer();
+ assertNotSame(t.termBuffer(), content);
+ assertEquals("hello", t.term());
+ assertEquals("word", t.type());
+ assertEquals(0, t.getFlags());
+
+ t = new Token(6, 22);
+ t.setTermBuffer(content, 0, content.length);
+ assertEquals("hello", t.term());
+ assertEquals("(hello,6,22)", t.toString());
+ assertEquals("word", t.type());
+ assertEquals(0, t.getFlags());
+
+ t = new Token(6, 22, 7);
+ t.setTermBuffer(content, 0, content.length);
+ assertEquals("hello", t.term());
+ assertEquals("(hello,6,22)", t.toString());
+ assertEquals(7, t.getFlags());
+
+ t = new Token(6, 22, "junk");
+ t.setTermBuffer(content, 0, content.length);
+ assertEquals("hello", t.term());
+ assertEquals("(hello,6,22,type=junk)", t.toString());
+ assertEquals(0, t.getFlags());
+ }
+
+ public void testResize() {
+ Token t = new Token();
+ char[] content = "hello".toCharArray();
+ t.setTermBuffer(content, 0, content.length);
+ for (int i = 0; i < 2000; i++)
+ {
+ t.resizeTermBuffer(i);
+ assertTrue(i <= t.termBuffer().length);
+ assertEquals("hello", t.term());
+ }
+ }
+
+ public void testGrow() {
+ Token t = new Token();
+ StringBuffer buf = new StringBuffer("ab");
+ for (int i = 0; i < 20; i++)
+ {
+ char[] content = buf.toString().toCharArray();
+ t.setTermBuffer(content, 0, content.length);
+ assertEquals(buf.length(), t.termLength());
+ assertEquals(buf.toString(), t.term());
+ buf.append(buf.toString());
+ }
+ assertEquals(1048576, t.termLength());
+ assertEquals(1179654, t.termBuffer().length);
+
+ // now as a string, first variant
+ t = new Token();
+ buf = new StringBuffer("ab");
+ for (int i = 0; i < 20; i++)
+ {
+ String content = buf.toString();
+ t.setTermBuffer(content, 0, content.length());
+ assertEquals(content.length(), t.termLength());
+ assertEquals(content, t.term());
+ buf.append(content);
+ }
+ assertEquals(1048576, t.termLength());
+ assertEquals(1179654, t.termBuffer().length);
+
+ // now as a string, second variant
+ t = new Token();
+ buf = new StringBuffer("ab");
+ for (int i = 0; i < 20; i++)
+ {
+ String content = buf.toString();
+ t.setTermBuffer(content);
+ assertEquals(content.length(), t.termLength());
+ assertEquals(content, t.term());
+ buf.append(content);
+ }
+ assertEquals(1048576, t.termLength());
+ assertEquals(1179654, t.termBuffer().length);
+
+ // Test for slow growth to a long term
+ t = new Token();
+ buf = new StringBuffer("a");
+ for (int i = 0; i < 20000; i++)
+ {
+ String content = buf.toString();
+ t.setTermBuffer(content);
+ assertEquals(content.length(), t.termLength());
+ assertEquals(content, t.term());
+ buf.append("a");
+ }
+ assertEquals(20000, t.termLength());
+ assertEquals(20331, t.termBuffer().length);
+
+ // Test for slow growth to a long term
+ t = new Token();
+ buf = new StringBuffer("a");
+ for (int i = 0; i < 20000; i++)
+ {
+ String content = buf.toString();
+ t.setTermBuffer(content);
+ assertEquals(content.length(), t.termLength());
+ assertEquals(content, t.term());
+ buf.append("a");
+ }
+ assertEquals(20000, t.termLength());
+ assertEquals(20331, t.termBuffer().length);
+ }
+
public void testToString() throws Exception {
char[] b = {'a', 'l', 'o', 'h', 'a'};
Token t = new Token("", 0, 5);
@@ -40,10 +152,10 @@
Token t = new Token("hello", 0, 5);
assertEquals(t.termText(), "hello");
assertEquals(t.termLength(), 5);
- assertEquals(new String(t.termBuffer(), 0, 5), "hello");
+ assertEquals(t.term(), "hello");
t.setTermText("hello2");
assertEquals(t.termLength(), 6);
- assertEquals(new String(t.termBuffer(), 0, 6), "hello2");
+ assertEquals(t.term(), "hello2");
t.setTermBuffer("hello3".toCharArray(), 0, 6);
assertEquals(t.termText(), "hello3");
@@ -53,4 +165,13 @@
buffer[1] = 'o';
assertEquals(t.termText(), "hollo3");
}
+
+ public void testClone() throws Exception {
+ Token t = new Token(0, 5);
+ char[] content = "hello".toCharArray();
+ t.setTermBuffer(content, 0, 5);
+ char[] buf = t.termBuffer();
+ Token copy = (Token) t.clone();
+ assertNotSame(buf, copy.termBuffer());
+ }
}
Index: src/test/org/apache/lucene/analysis/TestPerFieldAnalzyerWrapper.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestPerFieldAnalzyerWrapper.java (revision 683439)
+++ src/test/org/apache/lucene/analysis/TestPerFieldAnalzyerWrapper.java (working copy)
@@ -29,16 +29,16 @@
TokenStream tokenStream = analyzer.tokenStream("field",
new StringReader(text));
- Token token = tokenStream.next();
+ Token token = tokenStream.next(new Token());
assertEquals("WhitespaceAnalyzer does not lowercase",
"Qwerty",
- token.termText());
+ token.term());
tokenStream = analyzer.tokenStream("special",
new StringReader(text));
- token = tokenStream.next();
+ token = tokenStream.next(token);
assertEquals("SimpleAnalyzer lowercases",
"qwerty",
- token.termText());
+ token.term());
}
}
Index: src/test/org/apache/lucene/analysis/TeeSinkTokenTest.java
===================================================================
--- src/test/org/apache/lucene/analysis/TeeSinkTokenTest.java (revision 683439)
+++ src/test/org/apache/lucene/analysis/TeeSinkTokenTest.java (working copy)
@@ -63,23 +63,22 @@
SinkTokenizer sink1 = new SinkTokenizer(null) {
public void add(Token t) {
- if (t != null && t.termText().equalsIgnoreCase("The")) {
+ if (t != null && t.term().equalsIgnoreCase("The")) {
super.add(t);
}
}
};
TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), sink1);
- Token token = null;
int i = 0;
- while ((token = source.next()) != null) {
- assertTrue(token.termText() + " is not equal to " + tokens1[i], token.termText().equals(tokens1[i]) == true);
+ for (Token token = source.next(new Token()); token != null; token = source.next(token)) {
+ assertTrue(token.term() + " is not equal to " + tokens1[i], token.term().equals(tokens1[i]) == true);
i++;
}
assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
assertTrue("sink1 Size: " + sink1.getTokens().size() + " is not: " + 2, sink1.getTokens().size() == 2);
i = 0;
- while ((token = sink1.next()) != null) {
- assertTrue(token.termText() + " is not equal to " + "The", token.termText().equalsIgnoreCase("The") == true);
+ for (Token token = sink1.next(new Token()); token != null; token = sink1.next(token)) {
+ assertTrue(token.term() + " is not equal to " + "The", token.term().equalsIgnoreCase("The") == true);
i++;
}
assertTrue(i + " does not equal: " + sink1.getTokens().size(), i == sink1.getTokens().size());
@@ -88,54 +87,53 @@
public void testMultipleSources() throws Exception {
SinkTokenizer theDetector = new SinkTokenizer(null) {
public void add(Token t) {
- if (t != null && t.termText().equalsIgnoreCase("The")) {
+ if (t != null && t.term().equalsIgnoreCase("The")) {
super.add(t);
}
}
};
SinkTokenizer dogDetector = new SinkTokenizer(null) {
public void add(Token t) {
- if (t != null && t.termText().equalsIgnoreCase("Dogs")) {
+ if (t != null && t.term().equalsIgnoreCase("Dogs")) {
super.add(t);
}
}
};
TokenStream source1 = new CachingTokenFilter(new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), theDetector), dogDetector));
TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer2.toString())), theDetector), dogDetector);
- Token token = null;
int i = 0;
- while ((token = source1.next()) != null) {
- assertTrue(token.termText() + " is not equal to " + tokens1[i], token.termText().equals(tokens1[i]) == true);
+ for (Token token = source1.next(new Token()); token != null; token = source1.next(token)) {
+ assertTrue(token.term() + " is not equal to " + tokens1[i], token.term().equals(tokens1[i]) == true);
i++;
}
assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 2, theDetector.getTokens().size() == 2);
assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 1, dogDetector.getTokens().size() == 1);
i = 0;
- while ((token = source2.next()) != null) {
- assertTrue(token.termText() + " is not equal to " + tokens2[i], token.termText().equals(tokens2[i]) == true);
+ for (Token token = source2.next(new Token()); token != null; token = source2.next(token)) {
+ assertTrue(token.term() + " is not equal to " + tokens2[i], token.term().equals(tokens2[i]) == true);
i++;
}
assertTrue(i + " does not equal: " + tokens2.length, i == tokens2.length);
assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 4, theDetector.getTokens().size() == 4);
assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 2, dogDetector.getTokens().size() == 2);
i = 0;
- while ((token = theDetector.next()) != null) {
- assertTrue(token.termText() + " is not equal to " + "The", token.termText().equalsIgnoreCase("The") == true);
+ for (Token token = theDetector.next(new Token()); token != null; token = theDetector.next(token)) {
+ assertTrue(token.term() + " is not equal to " + "The", token.term().equalsIgnoreCase("The") == true);
i++;
}
assertTrue(i + " does not equal: " + theDetector.getTokens().size(), i == theDetector.getTokens().size());
i = 0;
- while ((token = dogDetector.next()) != null) {
- assertTrue(token.termText() + " is not equal to " + "Dogs", token.termText().equalsIgnoreCase("Dogs") == true);
+ for (Token token = dogDetector.next(new Token()); token != null; token = dogDetector.next(token)) {
+ assertTrue(token.term() + " is not equal to " + "Dogs", token.term().equalsIgnoreCase("Dogs") == true);
i++;
}
assertTrue(i + " does not equal: " + dogDetector.getTokens().size(), i == dogDetector.getTokens().size());
source1.reset();
TokenStream lowerCasing = new LowerCaseFilter(source1);
i = 0;
- while ((token = lowerCasing.next()) != null) {
- assertTrue(token.termText() + " is not equal to " + tokens1[i].toLowerCase(), token.termText().equals(tokens1[i].toLowerCase()) == true);
+ for (Token token = lowerCasing.next(new Token()); token != null; token = lowerCasing.next(token)) {
+ assertTrue(token.term() + " is not equal to " + tokens1[i].toLowerCase(), token.term().equals(tokens1[i].toLowerCase()) == true);
i++;
}
assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
@@ -172,7 +170,7 @@
for (int i = 0; i < tmp.size(); i++) {
Token tfTok = (Token) tmp.get(i);
Token sinkTok = (Token) sinkList.get(i);
- assertTrue(tfTok.termText() + " is not equal to " + sinkTok.termText() + " at token: " + i, tfTok.termText().equals(sinkTok.termText()) == true);
+ assertTrue(tfTok.term() + " is not equal to " + sinkTok.term() + " at token: " + i, tfTok.term().equals(sinkTok.term()) == true);
}
//simulate two fields, each being analyzed once, for 20 documents
@@ -254,7 +252,7 @@
public void add(Token t) {
if (t != null && count % modCount == 0) {
- lst.add(t.clone());
+ super.add(t);
}
count++;
}
Index: src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java (revision 683439)
+++ src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java (working copy)
@@ -42,11 +42,16 @@
TokenStream stream = new TokenStream() {
private int index = 0;
- public Token next() throws IOException {
+ public Token next(Token token) throws IOException {
if (index == tokens.length) {
return null;
} else {
- return new Token(tokens[index++], 0, 0);
+ token.clear();
+ token.setTermBuffer(tokens[index++]);
+ token.setStartOffset(0);
+ token.setEndOffset(0);
+ token.setType(Token.DEFAULT_TYPE);
+ return token;
}
}
@@ -91,10 +96,9 @@
private void checkTokens(TokenStream stream) throws IOException {
int count = 0;
- Token token;
- while ((token = stream.next()) != null) {
+ for (Token token = stream.next(new Token()); token != null; token = stream.next(token)) {
assertTrue(count < tokens.length);
- assertEquals(tokens[count], token.termText());
+ assertEquals(tokens[count], token.term());
count++;
}
Index: src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java (revision 683439)
+++ src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java (working copy)
@@ -35,10 +35,11 @@
public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes, int[] expectedPosIncrs) throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+ Token t = new Token();
for (int i = 0; i < expectedImages.length; i++) {
- Token t = ts.next();
+ t = ts.next(t);
assertNotNull(t);
- assertEquals(expectedImages[i], t.termText());
+ assertEquals(expectedImages[i], t.term());
if (expectedTypes != null) {
assertEquals(expectedTypes[i], t.type());
}
@@ -46,7 +47,7 @@
assertEquals(expectedPosIncrs[i], t.getPositionIncrement());
}
}
- assertNull(ts.next());
+ assertNull(ts.next(t));
ts.close();
}
Index: src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java (revision 683439)
+++ src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java (working copy)
@@ -25,81 +25,82 @@
public void testU() throws Exception {
TokenStream stream = new WhitespaceTokenizer(new StringReader("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï IJ Ð Ñ Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ij ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl"));
ISOLatin1AccentFilter filter = new ISOLatin1AccentFilter(stream);
- assertEquals("Des", filter.next().termText());
- assertEquals("mot", filter.next().termText());
- assertEquals("cles", filter.next().termText());
- assertEquals("A", filter.next().termText());
- assertEquals("LA", filter.next().termText());
- assertEquals("CHAINE", filter.next().termText());
- assertEquals("A", filter.next().termText());
- assertEquals("A", filter.next().termText());
- assertEquals("A", filter.next().termText());
- assertEquals("A", filter.next().termText());
- assertEquals("A", filter.next().termText());
- assertEquals("A", filter.next().termText());
- assertEquals("AE", filter.next().termText());
- assertEquals("C", filter.next().termText());
- assertEquals("E", filter.next().termText());
- assertEquals("E", filter.next().termText());
- assertEquals("E", filter.next().termText());
- assertEquals("E", filter.next().termText());
- assertEquals("I", filter.next().termText());
- assertEquals("I", filter.next().termText());
- assertEquals("I", filter.next().termText());
- assertEquals("I", filter.next().termText());
- assertEquals("IJ", filter.next().termText());
- assertEquals("D", filter.next().termText());
- assertEquals("N", filter.next().termText());
- assertEquals("O", filter.next().termText());
- assertEquals("O", filter.next().termText());
- assertEquals("O", filter.next().termText());
- assertEquals("O", filter.next().termText());
- assertEquals("O", filter.next().termText());
- assertEquals("O", filter.next().termText());
- assertEquals("OE", filter.next().termText());
- assertEquals("TH", filter.next().termText());
- assertEquals("U", filter.next().termText());
- assertEquals("U", filter.next().termText());
- assertEquals("U", filter.next().termText());
- assertEquals("U", filter.next().termText());
- assertEquals("Y", filter.next().termText());
- assertEquals("Y", filter.next().termText());
- assertEquals("a", filter.next().termText());
- assertEquals("a", filter.next().termText());
- assertEquals("a", filter.next().termText());
- assertEquals("a", filter.next().termText());
- assertEquals("a", filter.next().termText());
- assertEquals("a", filter.next().termText());
- assertEquals("ae", filter.next().termText());
- assertEquals("c", filter.next().termText());
- assertEquals("e", filter.next().termText());
- assertEquals("e", filter.next().termText());
- assertEquals("e", filter.next().termText());
- assertEquals("e", filter.next().termText());
- assertEquals("i", filter.next().termText());
- assertEquals("i", filter.next().termText());
- assertEquals("i", filter.next().termText());
- assertEquals("i", filter.next().termText());
- assertEquals("ij", filter.next().termText());
- assertEquals("d", filter.next().termText());
- assertEquals("n", filter.next().termText());
- assertEquals("o", filter.next().termText());
- assertEquals("o", filter.next().termText());
- assertEquals("o", filter.next().termText());
- assertEquals("o", filter.next().termText());
- assertEquals("o", filter.next().termText());
- assertEquals("o", filter.next().termText());
- assertEquals("oe", filter.next().termText());
- assertEquals("ss", filter.next().termText());
- assertEquals("th", filter.next().termText());
- assertEquals("u", filter.next().termText());
- assertEquals("u", filter.next().termText());
- assertEquals("u", filter.next().termText());
- assertEquals("u", filter.next().termText());
- assertEquals("y", filter.next().termText());
- assertEquals("y", filter.next().termText());
- assertEquals("fi", filter.next().termText());
- assertEquals("fl", filter.next().termText());
- assertNull(filter.next());
+ Token token = new Token();
+ assertEquals("Des", filter.next(token).term());
+ assertEquals("mot", filter.next(token).term());
+ assertEquals("cles", filter.next(token).term());
+ assertEquals("A", filter.next(token).term());
+ assertEquals("LA", filter.next(token).term());
+ assertEquals("CHAINE", filter.next(token).term());
+ assertEquals("A", filter.next(token).term());
+ assertEquals("A", filter.next(token).term());
+ assertEquals("A", filter.next(token).term());
+ assertEquals("A", filter.next(token).term());
+ assertEquals("A", filter.next(token).term());
+ assertEquals("A", filter.next(token).term());
+ assertEquals("AE", filter.next(token).term());
+ assertEquals("C", filter.next(token).term());
+ assertEquals("E", filter.next(token).term());
+ assertEquals("E", filter.next(token).term());
+ assertEquals("E", filter.next(token).term());
+ assertEquals("E", filter.next(token).term());
+ assertEquals("I", filter.next(token).term());
+ assertEquals("I", filter.next(token).term());
+ assertEquals("I", filter.next(token).term());
+ assertEquals("I", filter.next(token).term());
+ assertEquals("IJ", filter.next(token).term());
+ assertEquals("D", filter.next(token).term());
+ assertEquals("N", filter.next(token).term());
+ assertEquals("O", filter.next(token).term());
+ assertEquals("O", filter.next(token).term());
+ assertEquals("O", filter.next(token).term());
+ assertEquals("O", filter.next(token).term());
+ assertEquals("O", filter.next(token).term());
+ assertEquals("O", filter.next(token).term());
+ assertEquals("OE", filter.next(token).term());
+ assertEquals("TH", filter.next(token).term());
+ assertEquals("U", filter.next(token).term());
+ assertEquals("U", filter.next(token).term());
+ assertEquals("U", filter.next(token).term());
+ assertEquals("U", filter.next(token).term());
+ assertEquals("Y", filter.next(token).term());
+ assertEquals("Y", filter.next(token).term());
+ assertEquals("a", filter.next(token).term());
+ assertEquals("a", filter.next(token).term());
+ assertEquals("a", filter.next(token).term());
+ assertEquals("a", filter.next(token).term());
+ assertEquals("a", filter.next(token).term());
+ assertEquals("a", filter.next(token).term());
+ assertEquals("ae", filter.next(token).term());
+ assertEquals("c", filter.next(token).term());
+ assertEquals("e", filter.next(token).term());
+ assertEquals("e", filter.next(token).term());
+ assertEquals("e", filter.next(token).term());
+ assertEquals("e", filter.next(token).term());
+ assertEquals("i", filter.next(token).term());
+ assertEquals("i", filter.next(token).term());
+ assertEquals("i", filter.next(token).term());
+ assertEquals("i", filter.next(token).term());
+ assertEquals("ij", filter.next(token).term());
+ assertEquals("d", filter.next(token).term());
+ assertEquals("n", filter.next(token).term());
+ assertEquals("o", filter.next(token).term());
+ assertEquals("o", filter.next(token).term());
+ assertEquals("o", filter.next(token).term());
+ assertEquals("o", filter.next(token).term());
+ assertEquals("o", filter.next(token).term());
+ assertEquals("o", filter.next(token).term());
+ assertEquals("oe", filter.next(token).term());
+ assertEquals("ss", filter.next(token).term());
+ assertEquals("th", filter.next(token).term());
+ assertEquals("u", filter.next(token).term());
+ assertEquals("u", filter.next(token).term());
+ assertEquals("u", filter.next(token).term());
+ assertEquals("u", filter.next(token).term());
+ assertEquals("y", filter.next(token).term());
+ assertEquals("y", filter.next(token).term());
+ assertEquals("fi", filter.next(token).term());
+ assertEquals("fl", filter.next(token).term());
+ assertNull(filter.next(token));
}
}
Index: src/test/org/apache/lucene/analysis/TestLengthFilter.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestLengthFilter.java (revision 683439)
+++ src/test/org/apache/lucene/analysis/TestLengthFilter.java (working copy)
@@ -27,10 +27,11 @@
TokenStream stream = new WhitespaceTokenizer(
new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
LengthFilter filter = new LengthFilter(stream, 2, 6);
- assertEquals("short", filter.next().termText());
- assertEquals("ab", filter.next().termText());
- assertEquals("foo", filter.next().termText());
- assertNull(filter.next());
+ Token token = new Token();
+ assertEquals("short", filter.next(token).term());
+ assertEquals("ab", filter.next(token).term());
+ assertEquals("foo", filter.next(token).term());
+ assertNull(filter.next(token));
}
}
Index: src/test/org/apache/lucene/analysis/TestAnalyzers.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestAnalyzers.java (revision 683439)
+++ src/test/org/apache/lucene/analysis/TestAnalyzers.java (working copy)
@@ -17,13 +17,14 @@
* limitations under the License.
*/
-import java.io.*;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.LinkedList;
import java.util.List;
-import java.util.LinkedList;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.index.Payload;
import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.index.Payload;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
public class TestAnalyzers extends LuceneTestCase {
@@ -35,12 +36,13 @@
String input,
String[] output) throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+ Token t = new Token();
for (int i=0; i<output.length; i++) {
- Token t = ts.next();
+ t = ts.next(t);
assertNotNull(t);
- assertEquals(t.termText(), output[i]);
+ assertEquals(t.term(), output[i]);
}
- assertNull(ts.next());
+ assertNull(ts.next(t));
ts.close();
}
Index: src/test/org/apache/lucene/analysis/TestStopFilter.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestStopFilter.java (revision 683439)
+++ src/test/org/apache/lucene/analysis/TestStopFilter.java (working copy)
log("---> test with enable-increments-"+(enableIcrements?"enabled":"disabled"));
stpf.setEnablePositionIncrements(enableIcrements);
+ Token t = new Token();
for (int i=0; i<20; i+=3) {
- Token t = stpf.next();
+ t = stpf.next(t);
log("Token "+i+": "+t);
String w = English.intToEnglish(i).trim();
- assertEquals("expecting token "+i+" to be "+w,w,t.termText());
+ assertEquals("expecting token "+i+" to be "+w,w,t.term());
assertEquals("all but first token must have position increment of 3",enableIcrements?(i==0?1:3):1,t.getPositionIncrement());
}
- assertNull(stpf.next());
+ assertNull(stpf.next(t));
}
// print debug info depending on VERBOSE
Index: src/test/org/apache/lucene/AnalysisTest.java
===================================================================
--- src/test/org/apache/lucene/AnalysisTest.java (revision 683439)
+++ src/test/org/apache/lucene/AnalysisTest.java (working copy)
@@ -70,9 +70,9 @@
Date start = new Date();
int count = 0;
- for (Token t = stream.next(); t!=null; t = stream.next()) {
+ for (Token t = stream.next(new Token()); t!=null; t = stream.next(t)) {
if (verbose) {
- System.out.println("Text=" + new String(t.termBuffer(), 0, t.termLength())
+ System.out.println("Text=" + t.term()
+ " start=" + t.startOffset()
+ " end=" + t.endOffset());
}
Index: src/test/org/apache/lucene/search/TestPositionIncrement.java
===================================================================
--- src/test/org/apache/lucene/search/TestPositionIncrement.java (revision 683439)
+++ src/test/org/apache/lucene/search/TestPositionIncrement.java (working copy)
@@ -49,13 +49,17 @@
private final int[] INCREMENTS = {1, 2, 1, 0, 1};
private int i = 0;
- public Token next() {
+ public Token next(Token token) {
if (i == TOKENS.length)
return null;
- Token t = new Token(TOKENS[i], i, i);
- t.setPositionIncrement(INCREMENTS[i]);
+ token.clear();
+ token.setTermBuffer(TOKENS[i]);
+ token.setStartOffset(i);
+ token.setEndOffset(i);
+ token.setType(Token.DEFAULT_TYPE);
+ token.setPositionIncrement(INCREMENTS[i]);
i++;
- return t;
+ return token;
}
};
}
@@ -204,11 +208,8 @@
Analyzer analyzer = new WhitespaceAnalyzer();
TokenStream ts = analyzer.tokenStream("field",
new StringReader("one two three four five"));
-
- while (true) {
- Token token = ts.next();
- if (token == null) break;
- assertEquals(token.termText(), 1, token.getPositionIncrement());
+ for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+ assertEquals(token.term(), 1, token.getPositionIncrement());
}
}
}
Index: src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java
===================================================================
--- src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java (revision 683439)
+++ src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java (working copy)
@@ -16,22 +16,32 @@
* limitations under the License.
*/
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.analysis.*;
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.Term;
-import org.apache.lucene.search.*;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.CheckHits;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.search.spans.TermSpans;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.English;
+import org.apache.lucene.util.LuceneTestCase;
-import java.io.IOException;
-import java.io.Reader;
-
public class TestBoostingTermQuery extends LuceneTestCase {
private IndexSearcher searcher;
private BoostingSimilarity similarity = new BoostingSimilarity();
@@ -62,8 +72,8 @@
this.fieldName = fieldName;
}
- public Token next() throws IOException {
- Token result = input.next();
+ public Token next(Token token) throws IOException {
+ Token result = input.next(token);
if (result != null) {
if (fieldName.equals("field")) {
result.setPayload(new Payload(payloadField));
Index: src/test/org/apache/lucene/index/TestTermVectorsReader.java
===================================================================
--- src/test/org/apache/lucene/index/TestTermVectorsReader.java (revision 683439)
+++ src/test/org/apache/lucene/index/TestTermVectorsReader.java (working copy)
@@ -118,20 +118,21 @@
private class MyTokenStream extends TokenStream {
int tokenUpto;
- public Token next() {
+ public Token next(Token token) {
if (tokenUpto >= tokens.length)
return null;
else {
- final Token t = new Token();
final TestToken testToken = tokens[tokenUpto++];
- t.setTermText(testToken.text);
+ token.clear();
+ token.setTermBuffer(testToken.text);
if (tokenUpto > 1)
- t.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos);
+ token.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos);
else
- t.setPositionIncrement(testToken.pos+1);
- t.setStartOffset(testToken.startOffset);
- t.setEndOffset(testToken.endOffset);
- return t;
+ token.setPositionIncrement(testToken.pos+1);
+ token.setStartOffset(testToken.startOffset);
+ token.setEndOffset(testToken.endOffset);
+ token.setType(Token.DEFAULT_TYPE);
+ return token;
}
}
}
Index: src/test/org/apache/lucene/index/TestIndexWriter.java
===================================================================
--- src/test/org/apache/lucene/index/TestIndexWriter.java (revision 683439)
+++ src/test/org/apache/lucene/index/TestIndexWriter.java (working copy)
@@ -1786,11 +1786,11 @@
return new TokenFilter(new StandardTokenizer(reader)) {
private int count = 0;
- public Token next() throws IOException {
+ public Token next(Token token) throws IOException {
if (count++ == 5) {
throw new IOException();
}
- return input.next();
+ return input.next(token);
}
};
}
@@ -3574,13 +3574,13 @@
public void testNegativePositions() throws Throwable {
SinkTokenizer tokens = new SinkTokenizer();
Token t = new Token();
- t.setTermText("a");
+ t.setTermBuffer("a");
t.setPositionIncrement(0);
tokens.add(t);
- t.setTermText("b");
+ t.setTermBuffer("b");
t.setPositionIncrement(1);
tokens.add(t);
- t.setTermText("c");
+ t.setTermBuffer("c");
tokens.add(t);
MockRAMDirectory dir = new MockRAMDirectory();
Index: src/test/org/apache/lucene/index/TestMultiLevelSkipList.java
===================================================================
--- src/test/org/apache/lucene/index/TestMultiLevelSkipList.java (revision 683439)
+++ src/test/org/apache/lucene/index/TestMultiLevelSkipList.java (working copy)
@@ -103,12 +103,12 @@
super(input);
}
- public Token next() throws IOException {
- Token t = input.next();
- if (t != null) {
- t.setPayload(new Payload(new byte[] { (byte) count++ }));
+ public Token next(Token token) throws IOException {
+ token = input.next(token);
+ if (token != null) {
+ token.setPayload(new Payload(new byte[] { (byte) count++ }));
}
- return t;
+ return token;
}
}
Index: src/test/org/apache/lucene/index/TestTermdocPerf.java
===================================================================
--- src/test/org/apache/lucene/index/TestTermdocPerf.java (revision 683439)
+++ src/test/org/apache/lucene/index/TestTermdocPerf.java (working copy)
@@ -40,11 +40,12 @@
Token t;
public RepeatingTokenStream(String val) {
- t = new Token(val,0,val.length());
+ t = new Token(0,val.length());
+ t.setTermBuffer(val);
}
- public Token next() throws IOException {
- return --num<0 ? null : t;
+ public Token next(Token token) throws IOException {
+ return --num<0 ? null : (Token) t.clone();
}
}
Index: src/test/org/apache/lucene/index/TestDocumentWriter.java
===================================================================
--- src/test/org/apache/lucene/index/TestDocumentWriter.java (revision 683439)
+++ src/test/org/apache/lucene/index/TestDocumentWriter.java (working copy)
@@ -17,21 +17,27 @@
* limitations under the License.
*/
-import org.apache.lucene.analysis.*;
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Fieldable;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
-import org.apache.lucene.document.Fieldable;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
-import java.io.IOException;
-import java.io.Reader;
-
public class TestDocumentWriter extends LuceneTestCase {
private RAMDirectory dir;
@@ -134,10 +140,6 @@
boolean first=true;
Token buffered;
- public Token next() throws IOException {
- return input.next();
- }
-
public Token next(Token result) throws IOException {
if (buffered != null) {
Token t = buffered;
@@ -199,11 +201,16 @@
private String[] tokens = new String[] {"term1", "term2", "term3", "term2"};
private int index = 0;
- public Token next() throws IOException {
+ public Token next(Token token) throws IOException {
if (index == tokens.length) {
return null;
} else {
- return new Token(tokens[index++], 0, 0);
+ token.clear();
+ token.setTermBuffer(tokens[index++]);
+ token.setStartOffset(0);
+ token.setEndOffset(0);
+ token.setType(Token.DEFAULT_TYPE);
+ return token;
}
}
Index: src/test/org/apache/lucene/index/TestPayloads.java
===================================================================
--- src/test/org/apache/lucene/index/TestPayloads.java (revision 683439)
+++ src/test/org/apache/lucene/index/TestPayloads.java (working copy)
@@ -536,11 +536,15 @@
first = true;
}
- public Token next() throws IOException {
- if (!first) return null;
- Token t = new Token(term, 0, 0);
- t.setPayload(new Payload(payload));
- return t;
+ public Token next(Token token) throws IOException {
+ if (!first) return null;
+ token.clear();
+ token.setTermBuffer(term);
+ token.setStartOffset(0);
+ token.setEndOffset(0);
+ token.setType(Token.DEFAULT_TYPE);
+ token.setPayload(new Payload(payload));
+ return token;
}
public void close() throws IOException {
Index: src/java/org/apache/lucene/queryParser/QueryParser.java
===================================================================
--- src/java/org/apache/lucene/queryParser/QueryParser.java (revision 683439)
+++ src/java/org/apache/lucene/queryParser/QueryParser.java (working copy)
@@ -1,14 +1,35 @@
/* Generated By:JavaCC: Do not edit this line. QueryParser.java */
package org.apache.lucene.queryParser;
+import java.io.IOException;
+import java.io.StringReader;
+import java.text.DateFormat;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
import java.util.Vector;
-import java.io.*;
-import java.text.*;
-import java.util.*;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.document.DateField;
+import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.Term;
-import org.apache.lucene.analysis.*;
-import org.apache.lucene.document.*;
-import org.apache.lucene.search.*;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.ConstantScoreRangeQuery;
+import org.apache.lucene.search.FuzzyQuery;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.MultiPhraseQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.RangeQuery;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.Parameter;
/**
@@ -451,20 +472,20 @@
TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
Vector v = new Vector();
- org.apache.lucene.analysis.Token t;
+ org.apache.lucene.analysis.Token t = new org.apache.lucene.analysis.Token();
int positionCount = 0;
boolean severalTokensAtSamePosition = false;
while (true) {
try {
- t = source.next();
+ t = source.next(t);
}
catch (IOException e) {
t = null;
}
if (t == null)
break;
- v.addElement(t);
+ v.addElement(t.clone());
if (t.getPositionIncrement() != 0)
positionCount += t.getPositionIncrement();
else
@@ -481,7 +502,7 @@
return null;
else if (v.size() == 1) {
t = (org.apache.lucene.analysis.Token) v.elementAt(0);
- return new TermQuery(new Term(field, t.termText()));
+ return new TermQuery(new Term(field, t.term()));
} else {
if (severalTokensAtSamePosition) {
if (positionCount == 1) {
@@ -490,7 +511,7 @@
for (int i = 0; i < v.size(); i++) {
t = (org.apache.lucene.analysis.Token) v.elementAt(i);
TermQuery currentQuery = new TermQuery(
- new Term(field, t.termText()));
+ new Term(field, t.term()));
q.add(currentQuery, BooleanClause.Occur.SHOULD);
}
return q;
@@ -512,7 +533,7 @@
multiTerms.clear();
}
position += t.getPositionIncrement();
- multiTerms.add(new Term(field, t.termText()));
+ multiTerms.add(new Term(field, t.term()));
}
if (enablePositionIncrements) {
mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
@@ -530,9 +551,9 @@
t = (org.apache.lucene.analysis.Token) v.elementAt(i);
if (enablePositionIncrements) {
position += t.getPositionIncrement();
- pq.add(new Term(field, t.termText()),position);
+ pq.add(new Term(field, t.term()),position);
} else {
- pq.add(new Term(field, t.termText()));
+ pq.add(new Term(field, t.term()));
}
}
return pq;
Index: src/java/org/apache/lucene/queryParser/QueryParser.jj
===================================================================
--- src/java/org/apache/lucene/queryParser/QueryParser.jj (revision 683439)
+++ src/java/org/apache/lucene/queryParser/QueryParser.jj (working copy)
@@ -25,14 +25,35 @@
package org.apache.lucene.queryParser;
+import java.io.IOException;
+import java.io.StringReader;
+import java.text.DateFormat;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
import java.util.Vector;
-import java.io.*;
-import java.text.*;
-import java.util.*;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.document.DateField;
+import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.Term;
-import org.apache.lucene.analysis.*;
-import org.apache.lucene.document.*;
-import org.apache.lucene.search.*;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.ConstantScoreRangeQuery;
+import org.apache.lucene.search.FuzzyQuery;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.MultiPhraseQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.RangeQuery;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.Parameter;
/**
@@ -475,20 +496,20 @@
TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
Vector v = new Vector();
- org.apache.lucene.analysis.Token t;
+ org.apache.lucene.analysis.Token t = new org.apache.lucene.analysis.Token();
int positionCount = 0;
boolean severalTokensAtSamePosition = false;
while (true) {
try {
- t = source.next();
+ t = source.next(t);
}
catch (IOException e) {
t = null;
}
if (t == null)
break;
- v.addElement(t);
+ v.addElement(t.clone());
if (t.getPositionIncrement() != 0)
positionCount += t.getPositionIncrement();
else
@@ -505,7 +526,7 @@
return null;
else if (v.size() == 1) {
t = (org.apache.lucene.analysis.Token) v.elementAt(0);
- return new TermQuery(new Term(field, t.termText()));
+ return new TermQuery(new Term(field, t.term()));
} else {
if (severalTokensAtSamePosition) {
if (positionCount == 1) {
@@ -514,7 +535,7 @@
for (int i = 0; i < v.size(); i++) {
t = (org.apache.lucene.analysis.Token) v.elementAt(i);
TermQuery currentQuery = new TermQuery(
- new Term(field, t.termText()));
+ new Term(field, t.term()));
q.add(currentQuery, BooleanClause.Occur.SHOULD);
}
return q;
@@ -536,7 +557,7 @@
multiTerms.clear();
}
position += t.getPositionIncrement();
- multiTerms.add(new Term(field, t.termText()));
+ multiTerms.add(new Term(field, t.term()));
}
if (enablePositionIncrements) {
mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
@@ -554,9 +575,9 @@
t = (org.apache.lucene.analysis.Token) v.elementAt(i);
if (enablePositionIncrements) {
position += t.getPositionIncrement();
- pq.add(new Term(field, t.termText()),position);
+ pq.add(new Term(field, t.term()),position);
} else {
- pq.add(new Term(field, t.termText()));
+ pq.add(new Term(field, t.term()));
}
}
return pq;
Index: src/java/org/apache/lucene/analysis/SinkTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/SinkTokenizer.java (revision 683439)
+++ src/java/org/apache/lucene/analysis/SinkTokenizer.java (working copy)
@@ -22,11 +22,11 @@
}
public SinkTokenizer() {
- this.lst = new ArrayList();
+ this.lst = new ArrayList/*<Token>*/();
}
public SinkTokenizer(int initCap){
- this.lst = new ArrayList(initCap);
+ this.lst = new ArrayList/*<Token>*/(initCap);
}
/**
@@ -35,6 +35,8 @@
* WARNING: Adding tokens to this list requires the {@link #reset()} method to be called in order for them
* to be made available. Also, this Tokenizer does nothing to protect against {@link java.util.ConcurrentModificationException}s
* in the case of adds happening while {@link #next(org.apache.lucene.analysis.Token)} is being called.
+ *
+ * WARNING: Since this SinkTokenizer can be reset and the cached tokens made available again, do not modify them. Modify clones instead.
*
* @return A List of {@link org.apache.lucene.analysis.Token}s
*/
@@ -47,9 +49,14 @@
* @return The next {@link org.apache.lucene.analysis.Token} in the Sink.
* @throws IOException
*/
- public Token next() throws IOException {
+ public Token next(Token token) throws IOException {
if (iter == null) iter = lst.iterator();
- return iter.hasNext() ? (Token) iter.next() : null;
+ // Since this TokenStream can be reset we have to maintain the tokens as immutable
+ if (iter.hasNext()) {
+ token = (Token) iter.next();
+ return (Token) token.clone();
+ }
+ return null;
}
Index: src/java/org/apache/lucene/analysis/CachingTokenFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/CachingTokenFilter.java (revision 683439)
+++ src/java/org/apache/lucene/analysis/CachingTokenFilter.java (working copy)
@@ -40,11 +40,11 @@
super(input);
}
- public Token next() throws IOException {
+ public Token next(Token token) throws IOException {
if (cache == null) {
// fill cache lazily
cache = new LinkedList();
- fillCache();
+ fillCache(token);
iterator = cache.iterator();
}
@@ -52,8 +52,9 @@
// the cache is exhausted, return null
return null;
}
-
- return (Token) iterator.next();
+ // Since the TokenFilter can be reset, the tokens need to be preserved as immutable.
+ Token t = (Token) iterator.next();
+ return (Token) t.clone();
}
public void reset() throws IOException {
@@ -62,10 +63,9 @@
}
}
- private void fillCache() throws IOException {
- Token token;
- while ( (token = input.next()) != null) {
- cache.add(token);
+ private void fillCache(Token token) throws IOException {
+ for (token = input.next(token); token != null; token = input.next(token)) {
+ cache.add(token.clone());
}
}
Index: src/java/org/apache/lucene/analysis/CharTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/CharTokenizer.java (revision 683439)
+++ src/java/org/apache/lucene/analysis/CharTokenizer.java (working copy)
@@ -81,9 +81,9 @@
break; // return 'em
}
- token.termLength = length;
- token.startOffset = start;
- token.endOffset = start+length;
+ token.setTermLength(length);
+ token.setStartOffset(start);
+ token.setEndOffset(start+length);
return token;
}
Index: src/java/org/apache/lucene/analysis/Tokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/Tokenizer.java (revision 683439)
+++ src/java/org/apache/lucene/analysis/Tokenizer.java (working copy)
@@ -24,8 +24,9 @@
This is an abstract class.
- NOTE: subclasses must override at least one of {@link
- #next()} or {@link #next(Token)}.
+ NOTE: subclasses must override {@link #next(Token)}. It's
+ also OK to instead override {@link #next()} but that
+ method is now deprecated in favor of {@link #next(Token)}.
NOTE: subclasses overriding {@link #next(Token)} must
call {@link Token#clear()}.
Index: src/java/org/apache/lucene/analysis/PorterStemFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/PorterStemFilter.java (revision 683439)
+++ src/java/org/apache/lucene/analysis/PorterStemFilter.java (working copy)
@@ -48,7 +48,7 @@
public final Token next(Token result) throws IOException {
result = input.next(result);
if (result != null) {
- if (stemmer.stem(result.termBuffer(), 0, result.termLength))
+ if (stemmer.stem(result.termBuffer(), 0, result.termLength()))
result.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
return result;
} else
Index: src/java/org/apache/lucene/analysis/KeywordTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/KeywordTokenizer.java (revision 683439)
+++ src/java/org/apache/lucene/analysis/KeywordTokenizer.java (working copy)
@@ -51,7 +51,7 @@
if (upto == buffer.length)
buffer = result.resizeTermBuffer(1+buffer.length);
}
- result.termLength = upto;
+ result.setTermLength(upto);
return result;
}
return null;
Index: src/java/org/apache/lucene/analysis/Token.java
===================================================================
--- src/java/org/apache/lucene/analysis/Token.java (revision 683439)
+++ src/java/org/apache/lucene/analysis/Token.java (working copy)
@@ -19,8 +19,9 @@
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.TermPositions; // for javadoc
+import org.apache.lucene.util.ArrayUtil;
-/** A Token is an occurence of a term from the text of a field. It consists of
+/** A Token is an occurrence of a term from the text of a field. It consists of
a term's text, the start and end offset of the term in the text of the field,
and a type string.
@@ -29,7 +30,7 @@
browser, or to show matching text fragments in a KWIC (KeyWord In Context)
display, etc.
- The type is an interned string, assigned by a lexical analyzer
+ The type is a string, assigned by a lexical analyzer
(a.k.a. tokenizer), naming the lexical or syntactic class that the token
belongs to. For example an end of sentence marker token might be implemented
with type "eos". The default token type is "word".
@@ -49,7 +50,7 @@
NOTE: As of 2.3, Token stores the term text
internally as a malleable char[] termBuffer instead of
String termText. The indexing code and core tokenizers
- have been changed re-use a single Token instance, changing
+ have been changed to re-use a single Token instance, changing
its buffer and other fields in-place as the Token is
processed. This provides substantially better indexing
performance as it saves the GC cost of new'ing a Token and
@@ -62,14 +63,79 @@
instance when possible for best performance, by
implementing the {@link TokenStream#next(Token)} API.
Failing that, to create a new Token you should first use
- one of the constructors that starts with null text. Then
- you should call either {@link #termBuffer()} or {@link
- #resizeTermBuffer(int)} to retrieve the Token's
- termBuffer. Fill in the characters of your term into this
- buffer, and finally call {@link #setTermLength(int)} to
+ one of the constructors that starts with null text. To load
+ the token from a char[] use {@link #setTermBuffer(char[], int, int)}.
+ To load from a String use {@link #setTermBuffer(String)} or {@link #setTermBuffer(String, int, int)}.
+ Alternatively you can get the Token's termBuffer by calling either {@link #termBuffer()},
+ if you know that your text is shorter than the capacity of the termBuffer
+ or {@link #resizeTermBuffer(int)}, if there is any possibility
+ that you may need to grow the buffer. Fill in the characters of your term into this
+ buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string,
+ or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setTermLength(int)} to
set the length of the term text. See LUCENE-969
for details.
+ Typical reuse patterns:
+
+ - Copying text from a string:
+
+ // prepare the token for re-use
+ reusableToken.clear();
+ reusableToken.setTermBuffer(string);
+ reusableToken.setStartOffset(startOffset);
+ reusableToken.setEndOffset(endOffset);
+ reusableToken.setType(Token.DEFAULT_TYPE);
+
+
+ - Copying some text from a string:
+
+ // prepare the token for re-use
+ reusableToken.clear();
+ reusableToken.setTermBuffer(string, 0, string.length() - 1);
+ reusableToken.setStartOffset(startOffset);
+ reusableToken.setEndOffset(endOffset);
+ reusableToken.setType(Token.DEFAULT_TYPE);
+
+
+ - Copying text from char[] buffer:
+
+ // prepare the token for re-use
+ reusableToken.clear();
+ reusableToken.setTermBuffer(buffer, 0, buffer.length);
+ reusableToken.setStartOffset(startOffset);
+ reusableToken.setEndOffset(endOffset);
+ reusableToken.setType(Token.DEFAULT_TYPE);
+
+
+ - Copying some text from a char[] buffer:
+
+ // prepare the token for re-use
+ reusableToken.clear();
+ reusableToken.setTermBuffer(buffer, start, end - start);
+ reusableToken.setStartOffset(startOffset);
+ reusableToken.setEndOffset(endOffset);
+ reusableToken.setType(Token.DEFAULT_TYPE);
+
+
+ - Copying from one Token to another:
+
+ // prepare the token for re-use
+ reusableToken.clear();
+ reusableToken.setTermBuffer(source.termBuffer(), 0, source.termLength());
+ reusableToken.setStartOffset(startOffset);
+ reusableToken.setEndOffset(endOffset);
+ reusableToken.setType(Token.DEFAULT_TYPE);
+
+
+
+ A couple of things to note:
+
+ - clear() initializes most of the fields to default values, but not startOffset, endOffset and type.
+ - Because TokenStreams can be chained, one cannot assume that the Token's current type is correct.
+ - The startOffset and endOffset represent the start and end offset in the source text, so be careful when adjusting them.
+ - When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.
+
+
@see org.apache.lucene.index.Payload
*/
@@ -83,16 +149,56 @@
* deprecated APIs */
private String termText;
- char[] termBuffer; // characters for the term text
- int termLength; // length of term text in buffer
+ /**
+ * Characters for the term text.
+ * @deprecated This will be made private. Instead, use:
+ * {@link #termBuffer()},
+ * {@link #setTermBuffer(char[], int, int)},
+ * {@link #setTermBuffer(String)}, or
+ * {@link #setTermBuffer(String, int, int)}
+ */
+ char[] termBuffer;
- int startOffset; // start in source text
- int endOffset; // end in source text
- String type = DEFAULT_TYPE; // lexical type
+ /**
+ * Length of term text in the buffer.
+ * @deprecated This will be made private. Instead, use:
+ * {@link #termLength()}, or {@link #setTermLength(int)}.
+ */
+ int termLength;
+
+ /**
+ * Start in source text.
+ * @deprecated This will be made private. Instead, use:
+ * {@link #startOffset()}, or {@link #setStartOffset(int)}.
+ */
+ int startOffset;
+
+ /**
+ * End in source text.
+ * @deprecated This will be made private. Instead, use:
+ * {@link #endOffset()}, or {@link #setEndOffset(int)}.
+ */
+ int endOffset;
+
+ /**
+ * The lexical type of the token.
+ * @deprecated This will be made private. Instead, use:
+ * {@link #type()}, or {@link #setType(String)}.
+ */
+ String type = DEFAULT_TYPE;
+
private int flags;
+ /**
+ * @deprecated This will be made private. Instead, use:
+ * {@link #getPayload()}, or {@link #setPayload(Payload)}.
+ */
Payload payload;
+ /**
+ * @deprecated This will be made private. Instead, use:
+ * {@link #getPositionIncrement()}, or {@link #setPositionIncrement(int)}.
+ */
int positionIncrement = 1;
/** Constructs a Token with null text. */
@@ -101,8 +207,8 @@
/** Constructs a Token with null text and start & end
* offsets.
- * @param start start offset
- * @param end end offset */
+ * @param start start offset in the source text
+ * @param end end offset in the source text */
public Token(int start, int end) {
startOffset = start;
endOffset = end;
@@ -110,8 +216,9 @@
/** Constructs a Token with null text and start & end
* offsets plus the Token type.
- * @param start start offset
- * @param end end offset */
+ * @param start start offset in the source text
+ * @param end end offset in the source text
+ * @param typ the lexical type of this Token */
public Token(int start, int end, String typ) {
startOffset = start;
endOffset = end;
@@ -120,10 +227,10 @@
/**
* Constructs a Token with null text and start & end
- * offsets plus the Token type.
- * @param start start offset
- * @param end end offset
- * @param flags The bits to set for this token
+ * offsets plus flags. NOTE: flags is EXPERIMENTAL.
+ * @param start start offset in the source text
+ * @param end end offset in the source text
+ * @param flags The bits to set for this token
*/
public Token(int start, int end, int flags){
startOffset = start;
@@ -138,7 +245,9 @@
* term text.
* @param text term text
* @param start start offset
- * @param end end offset */
+ * @param end end offset
+ * @deprecated
+ */
public Token(String text, int start, int end) {
termText = text;
startOffset = start;
@@ -152,7 +261,9 @@
* @param text term text
* @param start start offset
* @param end end offset
- * @param typ token type */
+ * @param typ token type
+ * @deprecated
+ */
public Token(String text, int start, int end, String typ) {
termText = text;
startOffset = start;
@@ -169,6 +280,7 @@
* @param start
* @param end
* @param flags token type bits
+ * @deprecated
*/
public Token(String text, int start, int end, int flags) {
termText = text;
@@ -200,6 +312,7 @@
* occur with no intervening stop words.
*
*
+ * @param positionIncrement the distance from the prior term
* @see org.apache.lucene.index.TermPositions
*/
public void setPositionIncrement(int positionIncrement) {
@@ -218,7 +331,11 @@
/** Sets the Token's term text. NOTE: for better
* indexing speed you should instead use the char[]
- * termBuffer methods to set the term text. */
+ * termBuffer methods to set the term text.
+ * @deprecated use {@link #setTermBuffer(char[], int, int)} or
+ * {@link #setTermBuffer(String)} or
+ * {@link #setTermBuffer(String, int, int)}.
+ */
public void setTermText(String text) {
termText = text;
termBuffer = null;
@@ -230,7 +347,7 @@
* because the text is stored internally in a char[]. If
* possible, use {@link #termBuffer()} and {@link
* #termLength()} directly instead. If you really need a
- * String, use new String(token.termBuffer(), 0, token.termLength())
+ * String, use {@link #term()}
*/
public final String termText() {
if (termText == null && termBuffer != null)
@@ -238,19 +355,68 @@
return termText;
}
+ /** Returns the Token's term text.
+ *
+ * This method has a performance penalty
+ * because the text is stored internally in a char[]. If
+ * possible, use {@link #termBuffer()} and {@link
+ * #termLength()} directly instead. If you really need a
+ * String, use this method, which is nothing more than
+ * a convenience call to new String(token.termBuffer(), 0, token.termLength())
+ */
+ public final String term() {
+ if (termText != null)
+ return termText;
+ initTermBuffer();
+ return new String(termBuffer, 0, termLength);
+ }
+
/** Copies the contents of buffer, starting at offset for
- * length characters, into the termBuffer
- * array. NOTE: for better indexing speed you
- * should instead retrieve the termBuffer, using {@link
- * #termBuffer()} or {@link #resizeTermBuffer(int)}, and
- * fill it in directly to set the term text. This saves
- * an extra copy. */
+ * length characters, into the termBuffer array.
+ * @param buffer the buffer to copy
+ * @param offset the index in the buffer of the first character to copy
+ * @param length the number of characters to copy
+ */
public final void setTermBuffer(char[] buffer, int offset, int length) {
- resizeTermBuffer(length);
+ termText = null;
+ char[] newCharBuffer = growTermBuffer(length);
+ if (newCharBuffer != null) {
+ termBuffer = newCharBuffer;
+ }
System.arraycopy(buffer, offset, termBuffer, 0, length);
termLength = length;
}
+ /** Copies the contents of buffer into the termBuffer array.
+ * @param buffer the buffer to copy
+ */
+ public final void setTermBuffer(String buffer) {
+ termText = null;
+ int length = buffer.length();
+ char[] newCharBuffer = growTermBuffer(length);
+ if (newCharBuffer != null) {
+ termBuffer = newCharBuffer;
+ }
+ buffer.getChars(0, length, termBuffer, 0);
+ termLength = length;
+ }
+
+ /** Copies the contents of buffer, starting at offset and continuing
+ * for length characters, into the termBuffer array.
+ * @param buffer the buffer to copy
+ * @param offset the index in the buffer of the first character to copy
+ * @param length the number of characters to copy
+ */
+ public final void setTermBuffer(String buffer, int offset, int length) {
+ termText = null;
+ char[] newCharBuffer = growTermBuffer(length);
+ if (newCharBuffer != null) {
+ termBuffer = newCharBuffer;
+ }
+ buffer.getChars(offset, offset + length, termBuffer, 0);
+ termLength = length;
+ }
+
/** Returns the internal termBuffer character array which
* you can then directly alter. If the array is too
* small for your token, use {@link
@@ -263,23 +429,69 @@
return termBuffer;
}
- /** Grows the termBuffer to at least size newSize.
+ /** Grows the termBuffer to at least size newSize, preserving the
+ * existing content. Note: If the next operation is to change
+ * the contents of the term buffer use
+ * {@link #setTermBuffer(char[], int, int)},
+ * {@link #setTermBuffer(String)}, or
+ * {@link #setTermBuffer(String, int, int)}
+ * to optimally combine the resize with the setting of the termBuffer.
* @param newSize minimum size of the new termBuffer
* @return newly created termBuffer with length >= newSize
*/
public char[] resizeTermBuffer(int newSize) {
- initTermBuffer();
- if (newSize > termBuffer.length) {
- int size = termBuffer.length;
- while(size < newSize)
- size *= 2;
- char[] newBuffer = new char[size];
- System.arraycopy(termBuffer, 0, newBuffer, 0, termBuffer.length);
- termBuffer = newBuffer;
+ char[] newCharBuffer = growTermBuffer(newSize);
+ if (termBuffer == null) {
+ // If there were termText, then preserve it.
+ // note that if termBuffer is null then newCharBuffer cannot be null
+ assert newCharBuffer != null;
+ if (termText != null) {
+ termText.getChars(0, termText.length(), newCharBuffer, 0);
+ }
+ termBuffer = newCharBuffer;
+ } else if (newCharBuffer != null) {
+ // Note: if newCharBuffer != null then termBuffer needs to grow.
+ // If there were a termBuffer, then preserve it
+ System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
+ termBuffer = newCharBuffer;
}
+ termText = null;
return termBuffer;
}
+ /** Allocates a buffer char[] of at least newSize
+ * @param newSize minimum size of the buffer
+ * @return newly created buffer with length >= newSize or null if the current termBuffer is big enough
+ */
+ private char[] growTermBuffer(int newSize) {
+ if (termBuffer != null) {
+ if (termBuffer.length >= newSize)
+ // Already big enough
+ return null;
+ else
+ // Not big enough; create a new array with slight
+ // over allocation:
+ return new char[ArrayUtil.getNextSize(newSize)];
+ } else {
+
+ // determine the best size
+ // The buffer is always at least MIN_BUFFER_SIZE
+ if (newSize < MIN_BUFFER_SIZE) {
+ newSize = MIN_BUFFER_SIZE;
+ }
+
+ // If there is already a termText, then the size has to be at least that big
+ if (termText != null) {
+ int ttLength = termText.length();
+ if (newSize < ttLength) {
+ newSize = ttLength;
+ }
+ }
+
+ return new char[newSize];
+ }
+ }
+
// TODO: once we remove the deprecated termText() method
// and switch entirely to char[] termBuffer we don't need
// to use this method anymore
@@ -308,9 +520,16 @@
}
/** Set number of valid characters (length of the term) in
- * the termBuffer array. */
+ * the termBuffer array. Use this to truncate the termBuffer
+ * or to synchronize with external manipulation of the termBuffer.
+ * Note: to grow the size of the array,
+ * use {@link #resizeTermBuffer(int)} first.
+ * @param length the truncated length
+ */
public final void setTermLength(int length) {
initTermBuffer();
+ if (length > termBuffer.length)
+ throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")");
termLength = length;
}
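For reference, the grow-then-fill-then-set-length sequence that this javadoc and the class-level reuse notes describe might look like the following sketch (the TermBufferExample class and its fill method are assumptions, not part of the patch):

  import org.apache.lucene.analysis.Token;

  class TermBufferExample {
    // Copy src[srcOff .. srcOff+srcLen) into the token's term buffer by hand.
    static void fill(Token reusableToken, char[] src, int srcOff, int srcLen) {
      // Grow the buffer first so the copy below cannot overflow it.
      char[] buf = reusableToken.resizeTermBuffer(srcLen);
      System.arraycopy(src, srcOff, buf, 0, srcLen);
      // Record how many characters of the buffer now hold valid term text.
      reusableToken.setTermLength(srcLen);
    }
  }

In most cases setTermBuffer(char[], int, int) does the same job in a single call; the manual route is mainly worthwhile when the characters are produced directly into the buffer, avoiding the extra copy.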
@@ -331,7 +550,8 @@
}
/** Returns this Token's ending offset, one greater than the position of the
- last character corresponding to this token in the source text. */
+ last character corresponding to this token in the source text. The length
+ of the token in the source text is (endOffset - startOffset). */
public final int endOffset() {
return endOffset;
}
@@ -374,8 +594,6 @@
this.flags = flags;
}
-
-
/**
* Returns this Token's payload.
*/
@@ -424,9 +642,9 @@
public Object clone() {
try {
Token t = (Token)super.clone();
+ // Do a deep clone
if (termBuffer != null) {
- t.termBuffer = null;
- t.setTermBuffer(termBuffer, 0, termLength);
+ t.termBuffer = (char[]) termBuffer.clone();
}
if (payload != null) {
t.setPayload((Payload) payload.clone());
Index: src/java/org/apache/lucene/analysis/TokenFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/TokenFilter.java (revision 683439)
+++ src/java/org/apache/lucene/analysis/TokenFilter.java (working copy)
@@ -23,7 +23,7 @@
This is an abstract class.
NOTE: subclasses must override at least one of {@link
- #next()} or {@link #next(Token)}.
+ #next()} or {@link #next(Token)}. They should override {@link #next(Token)}.
*/
public abstract class TokenFilter extends TokenStream {
/** The source of tokens for this filter. */
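As a rough illustration of the next(Token) style the note above recommends for filters (a sketch only; the UpperCaseFilter name is made up and simply mirrors the LowerCaseFilter change elsewhere in this patch), a filter can rewrite the shared term buffer in place:

  import java.io.IOException;

  import org.apache.lucene.analysis.Token;
  import org.apache.lucene.analysis.TokenFilter;
  import org.apache.lucene.analysis.TokenStream;

  // Hypothetical filter: upper-cases each term in place, reusing the caller's Token.
  class UpperCaseFilter extends TokenFilter {
    UpperCaseFilter(TokenStream in) {
      super(in);
    }

    public Token next(Token token) throws IOException {
      token = input.next(token);
      if (token == null)
        return null;
      final char[] buffer = token.termBuffer();
      final int length = token.termLength();
      for (int i = 0; i < length; i++)
        buffer[i] = Character.toUpperCase(buffer[i]);
      return token; // no new Token allocated per call
    }
  }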
Index: src/java/org/apache/lucene/analysis/LengthFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/LengthFilter.java (revision 683439)
+++ src/java/org/apache/lucene/analysis/LengthFilter.java (working copy)
@@ -42,7 +42,7 @@
}
/**
- * Returns the next input Token whose termText() is the right len
+ * Returns the next input Token whose term() is the right len
*/
public final Token next(Token result) throws IOException
{
Index: src/java/org/apache/lucene/analysis/LowerCaseFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/LowerCaseFilter.java (revision 683439)
+++ src/java/org/apache/lucene/analysis/LowerCaseFilter.java (working copy)
@@ -34,7 +34,7 @@
if (result != null) {
final char[] buffer = result.termBuffer();
- final int length = result.termLength;
+ final int length = result.termLength();
for(int i=0;i<length;i++)
Index: src/java/org/apache/lucene/analysis/TokenStream.java
===================================================================
--- src/java/org/apache/lucene/analysis/TokenStream.java (revision 683439)
+++ src/java/org/apache/lucene/analysis/TokenStream.java (working copy)
{@link TokenFilter}, a TokenStream
whose input is another TokenStream.
- NOTE: subclasses must override at least one of {@link
- #next()} or {@link #next(Token)}.
+ NOTE: subclasses must override {@link #next(Token)}. It's
+ also OK to instead override {@link #next()} but that
+ method is now deprecated in favor of {@link #next(Token)}.
*/
public abstract class TokenStream {
/** Returns the next token in the stream, or null at EOS.
- * The returned Token is a "full private copy" (not
+ * @deprecated The returned Token is a "full private copy" (not
* re-used across calls to next()) but will be slower
* than calling {@link #next(Token)} instead. */
public Token next() throws IOException {
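On the consuming side, the pattern this patch adopts (see, for example, the QueryTermVector change below) is to allocate one Token up front and feed it back into every next(Token) call; a minimal sketch, with the ConsumeExample wrapper being an assumption:

  import java.io.IOException;
  import java.util.ArrayList;
  import java.util.List;

  import org.apache.lucene.analysis.Token;
  import org.apache.lucene.analysis.TokenStream;

  class ConsumeExample {
    // Collect every term from the stream while reusing a single Token instance.
    static List terms(TokenStream stream) throws IOException {
      List terms = new ArrayList();
      for (Token token = stream.next(new Token()); token != null; token = stream.next(token)) {
        terms.add(token.term()); // term() copies the characters, so no clone is needed here
      }
      return terms;
    }
  }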
Index: src/java/org/apache/lucene/search/QueryTermVector.java
===================================================================
--- src/java/org/apache/lucene/search/QueryTermVector.java (revision 683439)
+++ src/java/org/apache/lucene/search/QueryTermVector.java (working copy)
@@ -17,15 +17,20 @@
* limitations under the License.
*/
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.TermFreqVector;
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.*;
-
/**
*
*
@@ -51,12 +56,10 @@
TokenStream stream = analyzer.tokenStream("", new StringReader(queryString));
if (stream != null)
{
- Token next = null;
List terms = new ArrayList();
try {
- while ((next = stream.next()) != null)
- {
- terms.add(next.termText());
+ for (Token next = stream.next(new Token()); next != null; next = stream.next(next)) {
+ terms.add(next.term());
}
processTerms((String[])terms.toArray(new String[terms.size()]));
} catch (IOException e) {
Index: src/java/org/apache/lucene/index/DocInverterPerField.java
===================================================================
--- src/java/org/apache/lucene/index/DocInverterPerField.java (revision 683439)
+++ src/java/org/apache/lucene/index/DocInverterPerField.java (working copy)
@@ -81,13 +81,9 @@
final int valueLength = stringValue.length();
Token token = perThread.localToken;
token.clear();
- char[] termBuffer = token.termBuffer();
- if (termBuffer.length < valueLength)
- termBuffer = token.resizeTermBuffer(valueLength);
- stringValue.getChars(0, valueLength, termBuffer, 0);
- token.setTermLength(valueLength);
token.setStartOffset(fieldState.offset);
- token.setEndOffset(fieldState.offset + stringValue.length());
+ token.setEndOffset(fieldState.offset + valueLength);
+ token.setTermBuffer(stringValue, 0, valueLength);
boolean success = false;
try {
consumer.add(token);
Index: contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java
===================================================================
--- contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java (revision 683439)
+++ contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java (working copy)
@@ -54,11 +54,13 @@
* .
*/
-import java.io.*;
+import java.io.StringReader;
-import junit.framework.*;
+import junit.framework.TestCase;
-import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
public class TestSnowball extends TestCase {
@@ -66,12 +68,12 @@
String input,
String[] output) throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+ Token token = new Token();
for (int i = 0; i < output.length; i++) {
- Token t = ts.next();
- assertNotNull(t);
- assertEquals(output[i], t.termText());
+ Token t = ts.next(token);
+ assertEquals(output[i], t.term());
}
- assertNull(ts.next());
+ assertNull(ts.next(token));
ts.close();
}
@@ -83,21 +85,22 @@
public void testFilterTokens() throws Exception {
- final Token tok = new Token("accents", 2, 7, "wrd");
+ final Token tok = new Token(2, 7, "wrd");
+ tok.setTermBuffer("accents");
tok.setPositionIncrement(3);
SnowballFilter filter = new SnowballFilter(
new TokenStream() {
- public Token next() {
+ public Token next(Token token) {
return tok;
}
},
"English"
);
- Token newtok = filter.next();
+ Token newtok = filter.next(new Token());
- assertEquals("accent", newtok.termText());
+ assertEquals("accent", newtok.term());
assertEquals(2, newtok.startOffset());
assertEquals(7, newtok.endOffset());
assertEquals("wrd", newtok.type());
Index: contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java
===================================================================
--- contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java (revision 683439)
+++ contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java (working copy)
@@ -18,11 +18,10 @@
*/
import java.io.IOException;
-
import java.lang.reflect.Method;
import net.sf.snowball.SnowballProgram;
-import net.sf.snowball.ext.*;
+import net.sf.snowball.ext.EnglishStemmer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
@@ -60,20 +59,21 @@
}
/** Returns the next input Token, after being stemmed */
- public final Token next() throws IOException {
- Token token = input.next();
+ public final Token next(Token token) throws IOException {
+ token = input.next(token);
if (token == null)
return null;
- stemmer.setCurrent(token.termText());
+ String originalTerm = token.term();
+ stemmer.setCurrent(originalTerm);
try {
stemMethod.invoke(stemmer, EMPTY_ARGS);
} catch (Exception e) {
throw new RuntimeException(e.toString());
}
-
- Token newToken = new Token(stemmer.getCurrent(),
- token.startOffset(), token.endOffset(), token.type());
- newToken.setPositionIncrement(token.getPositionIncrement());
- return newToken;
+ String finalTerm = stemmer.getCurrent();
+ // Don't bother updating, if it is unchanged.
+ if (!originalTerm.equals(finalTerm))
+ token.setTermBuffer(finalTerm);
+ return token;
}
}
Index: contrib/wordnet/src/java/org/apache/lucene/wordnet/SynLookup.java
===================================================================
--- contrib/wordnet/src/java/org/apache/lucene/wordnet/SynLookup.java (revision 683439)
+++ contrib/wordnet/src/java/org/apache/lucene/wordnet/SynLookup.java (working copy)
@@ -17,15 +17,29 @@
* limitations under the License.
*/
-import org.apache.lucene.store.*;
-import org.apache.lucene.search.*;
-import org.apache.lucene.index.*;
-import org.apache.lucene.document.*;
-import org.apache.lucene.analysis.*;
-import java.io.*;
-import java.util.*;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Searcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.FSDirectory;
+
/**
* Test program to look up synonyms.
*/
@@ -86,10 +100,8 @@
// [1] Parse query into separate words so that when we expand we can avoid dups
TokenStream ts = a.tokenStream( field, new StringReader( query));
- org.apache.lucene.analysis.Token t;
- while ( (t = ts.next()) != null)
- {
- String word = t.termText();
+ for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+ String word = token.term();
if ( already.add( word))
top.add( word);
}
Index: contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java
===================================================================
--- contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java (revision 683439)
+++ contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java (working copy)
@@ -17,16 +17,30 @@
* limitations under the License.
*/
-import org.apache.lucene.store.*;
-import org.apache.lucene.search.*;
-import org.apache.lucene.index.*;
-import org.apache.lucene.document.*;
-import org.apache.lucene.analysis.*;
-import org.apache.lucene.analysis.standard.*;
-import java.io.*;
-import java.util.*;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Searcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.FSDirectory;
+
/**
* Expand a query by looking up synonyms for every term.
* You need to invoke {@link Syns2Index} first to build the synonym index.
@@ -99,10 +113,8 @@
// [1] Parse query into separate words so that when we expand we can avoid dups
TokenStream ts = a.tokenStream( field, new StringReader( query));
- org.apache.lucene.analysis.Token t;
- while ( (t = ts.next()) != null)
- {
- String word = t.termText();
+ for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+ String word = token.term();
if ( already.add( word))
top.add( word);
}
Index: contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java
===================================================================
--- contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java (revision 683439)
+++ contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java (working copy)
@@ -15,19 +15,32 @@
* limitations under the License.
*/
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+
import junit.framework.TestCase;
+
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
-import org.apache.lucene.index.*;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Payload;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermPositionVector;
+import org.apache.lucene.index.TermPositions;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
-import java.io.IOException;
-import java.util.*;
-
/**
* Asserts equality of content and behaviour of two index readers.
*/
@@ -151,21 +164,23 @@
document.add(f);
if (i > 4) {
final List tokens = new ArrayList(2);
- Token t = new Token("the", 0, 2, "text");
+ Token t = createToken("the", 0, 2, "text");
t.setPayload(new Payload(new byte[]{1, 2, 3}));
tokens.add(t);
- t = new Token("end", 3, 5, "text");
+ t = createToken("end", 3, 5, "text");
t.setPayload(new Payload(new byte[]{2}));
tokens.add(t);
- tokens.add(new Token("fin", 7, 9));
+ tokens.add(createToken("fin", 7, 9));
document.add(new Field("f", new TokenStream() {
Iterator it = tokens.iterator();
- public Token next() throws IOException {
+ public Token next(Token token) throws IOException {
if (!it.hasNext()) {
return null;
}
- return it.next();
+ // Resettable token streams need to return clones.
+ token = (Token) it.next();
+ return (Token) token.clone();
}
public void reset() throws IOException {
@@ -466,4 +481,19 @@
testReader.close();
}
+ private static Token createToken(String term, int start, int offset)
+ {
+ Token token = new Token(start, offset);
+ token.setTermBuffer(term);
+ return token;
+ }
+
+ private static Token createToken(String term, int start, int offset, String type)
+ {
+ Token token = new Token(start, offset, type);
+ token.setTermBuffer(term);
+ return token;
+ }
+
+
}
Index: contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java
===================================================================
--- contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java (revision 683439)
+++ contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java (working copy)
@@ -520,12 +520,13 @@
} else {
tokenStream = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
}
- Token next = tokenStream.next();
+ Token next = new Token();
+ next = tokenStream.next(next);
+
while (next != null) {
- next.setTermText(next.termText().intern()); // todo: not sure this needs to be interned?
- tokens.add(next); // the vector will be built on commit.
- next = tokenStream.next();
+ tokens.add((Token) next.clone()); // the vector will be built on commit.
+ next = tokenStream.next(next);
fieldSetting.fieldLength++;
if (fieldSetting.fieldLength > maxFieldLength) {
break;
@@ -533,7 +534,10 @@
}
} else {
// untokenized
- tokens.add(new Token(field.stringValue().intern(), 0, field.stringValue().length(), "untokenized"));
+ String fieldVal = field.stringValue();
+ Token token = new Token(0, fieldVal.length(), "untokenized");
+ token.setTermBuffer(fieldVal);
+ tokens.add(token);
fieldSetting.fieldLength++;
}
}
@@ -567,10 +571,10 @@
for (Token token : eField_Tokens.getValue()) {
- TermDocumentInformationFactory termDocumentInformationFactory = termDocumentInformationFactoryByTermText.get(token.termText());
+ TermDocumentInformationFactory termDocumentInformationFactory = termDocumentInformationFactoryByTermText.get(token.term());
if (termDocumentInformationFactory == null) {
termDocumentInformationFactory = new TermDocumentInformationFactory();
- termDocumentInformationFactoryByTermText.put(token.termText(), termDocumentInformationFactory);
+ termDocumentInformationFactoryByTermText.put(token.term(), termDocumentInformationFactory);
}
//termDocumentInformationFactory.termFrequency++;
Index: contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsFilterBuilder.java
===================================================================
--- contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsFilterBuilder.java (revision 683439)
+++ contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsFilterBuilder.java (working copy)
@@ -59,20 +59,21 @@
try
{
- Token token = ts.next();
+ Token token = new Token();
+ token = ts.next(token);
Term term = null;
while (token != null)
{
if (term == null)
{
- term = new Term(fieldName, token.termText());
+ term = new Term(fieldName, token.term());
} else
{
// create from previous to save fieldName.intern overhead
- term = term.createTerm(token.termText());
+ term = term.createTerm(token.term());
}
tf.addTerm(term);
- token = ts.next();
+ token = ts.next(token);
}
}
catch (IOException ioe)
Index: contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java
===================================================================
--- contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java (revision 683439)
+++ contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java (working copy)
@@ -74,16 +74,17 @@
if((stopWords!=null)&&(fields!=null))
{
stopWordsSet=new HashSet();
+ Token stopToken = new Token();
for (int i = 0; i < fields.length; i++)
{
TokenStream ts = analyzer.tokenStream(fields[i],new StringReader(stopWords));
try
{
- Token stopToken=ts.next();
+ stopToken=ts.next(stopToken);
while(stopToken!=null)
{
- stopWordsSet.add(stopToken.termText());
- stopToken=ts.next();
+ stopWordsSet.add(stopToken.term());
+ stopToken=ts.next(stopToken);
}
}
catch(IOException ioe)
Index: contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/SpanOrTermsBuilder.java
===================================================================
--- contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/SpanOrTermsBuilder.java (revision 683439)
+++ contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/SpanOrTermsBuilder.java (working copy)
@@ -52,12 +52,13 @@
{
ArrayList clausesList=new ArrayList();
TokenStream ts=analyzer.tokenStream(fieldName,new StringReader(value));
- Token token=ts.next();
+ Token token= new Token();
+ token = ts.next(token);
while(token!=null)
{
- SpanTermQuery stq=new SpanTermQuery(new Term(fieldName,token.termText()));
+ SpanTermQuery stq=new SpanTermQuery(new Term(fieldName,token.term()));
clausesList.add(stq);
- token=ts.next();
+ token=ts.next(token);
}
SpanOrQuery soq=new SpanOrQuery((SpanQuery[]) clausesList.toArray(new SpanQuery[clausesList.size()]));
soq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
Index: contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsQueryBuilder.java
===================================================================
--- contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsQueryBuilder.java (revision 683439)
+++ contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsQueryBuilder.java (working copy)
@@ -58,20 +58,21 @@
TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(text));
try
{
- Token token = ts.next();
+ Token token = new Token();
+ token = ts.next(token);
Term term = null;
while (token != null)
{
if (term == null)
{
- term = new Term(fieldName, token.termText());
+ term = new Term(fieldName, token.term());
} else
{
// create from previous to save fieldName.intern overhead
- term = term.createTerm(token.termText());
+ term = term.createTerm(token.term());
}
bq.add(new BooleanClause(new TermQuery(term),BooleanClause.Occur.SHOULD));
- token = ts.next();
+ token = ts.next(token);
}
}
catch (IOException ioe)
Index: contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
===================================================================
--- contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (revision 683439)
+++ contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (working copy)
@@ -1120,21 +1120,21 @@
{
lst = new ArrayList();
Token t;
- t = new Token("hi", 0, 2);
+ t = createToken("hi", 0, 2);
lst.add(t);
- t = new Token("hispeed", 0, 8);
+ t = createToken("hispeed", 0, 8);
lst.add(t);
- t = new Token("speed", 3, 8);
+ t = createToken("speed", 3, 8);
t.setPositionIncrement(0);
lst.add(t);
- t = new Token("10", 8, 10);
+ t = createToken("10", 8, 10);
lst.add(t);
- t = new Token("foo", 11, 14);
+ t = createToken("foo", 11, 14);
lst.add(t);
iter = lst.iterator();
}
- public Token next() throws IOException {
+ public Token next(Token token) throws IOException {
return iter.hasNext() ? (Token) iter.next() : null;
}
};
@@ -1149,21 +1149,21 @@
{
lst = new ArrayList();
Token t;
- t = new Token("hispeed", 0, 8);
+ t = createToken("hispeed", 0, 8);
lst.add(t);
- t = new Token("hi", 0, 2);
+ t = createToken("hi", 0, 2);
t.setPositionIncrement(0);
lst.add(t);
- t = new Token("speed", 3, 8);
+ t = createToken("speed", 3, 8);
lst.add(t);
- t = new Token("10", 8, 10);
+ t = createToken("10", 8, 10);
lst.add(t);
- t = new Token("foo", 11, 14);
+ t = createToken("foo", 11, 14);
lst.add(t);
iter = lst.iterator();
}
- public Token next() throws IOException {
+ public Token next(Token token) throws IOException {
return iter.hasNext() ? (Token) iter.next() : null;
}
};
@@ -1346,6 +1346,13 @@
super.tearDown();
}
+ private static Token createToken(String term, int start, int offset)
+ {
+ Token token = new Token(start, offset);
+ token.setTermBuffer(term);
+ return token;
+ }
+
}
// ===================================================================
@@ -1392,31 +1399,33 @@
this.synonyms = synonyms;
}
- public Token next() throws IOException {
+ public Token next(Token token) throws IOException {
if (currentRealToken == null) {
- Token nextRealToken = realStream.next();
+ Token nextRealToken = realStream.next(token);
if (nextRealToken == null) {
return null;
}
- String expansions = (String) synonyms.get(nextRealToken.termText());
+ String expansions = (String) synonyms.get(nextRealToken.term());
if (expansions == null) {
return nextRealToken;
}
st = new StringTokenizer(expansions, ",");
if (st.hasMoreTokens()) {
- currentRealToken = nextRealToken;
+ currentRealToken = (Token) nextRealToken.clone();
}
return currentRealToken;
} else {
- String nextExpandedValue = st.nextToken();
- Token expandedToken = new Token(nextExpandedValue, currentRealToken.startOffset(),
- currentRealToken.endOffset());
- expandedToken.setPositionIncrement(0);
+ token.clear();
+ token.setTermBuffer(st.nextToken());
+ token.setStartOffset(currentRealToken.startOffset());
+ token.setEndOffset(currentRealToken.endOffset());
+ token.setType(Token.DEFAULT_TYPE);
+ token.setPositionIncrement(0);
if (!st.hasMoreTokens()) {
currentRealToken = null;
st = null;
}
- return expandedToken;
+ return token;
}
}
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java (revision 683439)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java (working copy)
@@ -121,7 +121,7 @@
*/
public float getTokenScore(Token token) {
position += token.getPositionIncrement();
- String termText = new String(token.termBuffer(), 0, token.termLength());
+ String termText = token.term();
WeightedSpanTerm weightedSpanTerm;
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (revision 683439)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (working copy)
@@ -106,7 +106,7 @@
*/
public float getTokenScore(Token token)
{
- String termText=token.termText();
+ String termText=token.term();
WeightedTerm queryTerm=(WeightedTerm) termsToFind.get(termText);
if(queryTerm==null)
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (revision 683439)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (working copy)
@@ -147,7 +147,7 @@
{
this.tokens=tokens;
}
- public Token next()
+ public Token next(Token token)
{
if(currentToken>=tokens.length)
{
@@ -160,6 +160,7 @@
String[] terms=tpv.getTerms();
int[] freq=tpv.getTermFrequencies();
int totalTokens=0;
+ Token newToken = new Token();
for (int t = 0; t < freq.length; t++)
{
totalTokens+=freq[t];
@@ -189,9 +190,12 @@
}
for (int tp = 0; tp < offsets.length; tp++)
{
- unsortedTokens.add(new Token(terms[t],
- offsets[tp].getStartOffset(),
- offsets[tp].getEndOffset()));
+ newToken.clear();
+ newToken.setTermBuffer(terms[t]);
+ newToken.setStartOffset(offsets[tp].getStartOffset());
+ newToken.setEndOffset(offsets[tp].getEndOffset());
+ newToken.setType(Token.DEFAULT_TYPE);
+ unsortedTokens.add(newToken.clone());
}
}
else
@@ -204,9 +208,12 @@
//tokens stored with positions - can use this to index straight into sorted array
for (int tp = 0; tp < pos.length; tp++)
{
- tokensInOriginalOrder[pos[tp]]=new Token(terms[t],
- offsets[tp].getStartOffset(),
- offsets[tp].getEndOffset());
+ newToken.clear();
+ newToken.setTermBuffer(terms[t]);
+ newToken.setStartOffset(offsets[tp].getStartOffset());
+ newToken.setEndOffset(offsets[tp].getEndOffset());
+ newToken.setType(Token.DEFAULT_TYPE);
+ tokensInOriginalOrder[pos[tp]] = (Token) newToken.clone();
}
}
}
@@ -261,7 +268,7 @@
}
return getTokenStream(field, contents, analyzer);
}
- //conevenience method
+ //convenience method
public static TokenStream getTokenStream(String field, String contents, Analyzer analyzer){
return analyzer.tokenStream(field,new StringReader(contents));
}
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java (revision 683439)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java (working copy)
@@ -62,7 +62,7 @@
return false;
}
- WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(new String(token.termBuffer(), 0, token.termLength()));
+ WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(token.term());
if (wSpanTerm != null) {
List positionSpans = wSpanTerm.getPositionSpans();
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java (revision 683439)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java (working copy)
@@ -61,7 +61,7 @@
tot+=score;
}
}
- tokens[numTokens]=token;
+ tokens[numTokens]= (Token) token.clone();
scores[numTokens]=score;
numTokens++;
}
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (revision 683439)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (working copy)
@@ -22,6 +22,7 @@
import java.util.Iterator;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.PriorityQueue;
@@ -217,7 +218,7 @@
try
{
- org.apache.lucene.analysis.Token token;
+ Token token = new Token();
String tokenText;
int startOffset;
int endOffset;
@@ -225,7 +226,7 @@
textFragmenter.start(text);
TokenGroup tokenGroup=new TokenGroup();
- token = tokenStream.next();
+ token = tokenStream.next(token);
while ((token!= null)&&(token.startOffset()< maxDocCharsToAnalyze))
{
if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token)))
@@ -261,7 +262,7 @@
// {
// break;
// }
- token = tokenStream.next();
+ token = tokenStream.next(token);
}
currentFrag.setScore(fragmentScorer.getFragmentScore());
Index: contrib/miscellaneous/src/test/org/apache/lucene/queryParser/precedence/TestPrecedenceQueryParser.java
===================================================================
--- contrib/miscellaneous/src/test/org/apache/lucene/queryParser/precedence/TestPrecedenceQueryParser.java (revision 683439)
+++ contrib/miscellaneous/src/test/org/apache/lucene/queryParser/precedence/TestPrecedenceQueryParser.java (working copy)
@@ -57,19 +57,25 @@
boolean inPhrase = false;
int savedStart = 0, savedEnd = 0;
- public Token next() throws IOException {
+ public Token next(Token token) throws IOException {
if (inPhrase) {
inPhrase = false;
- return new Token("phrase2", savedStart, savedEnd);
+ token.setTermBuffer("phrase2");
+ token.setStartOffset(savedStart);
+ token.setEndOffset(savedEnd);
+ return token;
} else
- for (Token token = input.next(); token != null; token = input.next()) {
- if (token.termText().equals("phrase")) {
+ for (token = input.next(token); token != null; token = input.next(token)) {
+ if (token.term().equals("phrase")) {
inPhrase = true;
savedStart = token.startOffset();
savedEnd = token.endOffset();
- return new Token("phrase1", savedStart, savedEnd);
- } else if (!token.termText().equals("stop"))
+ token.setTermBuffer("phrase1");
+ token.setStartOffset(savedStart);
+ token.setEndOffset(savedEnd);
return token;
+ } else if (!token.term().equals("stop"))
+ return token;
}
return null;
}
Index: contrib/miscellaneous/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java
===================================================================
--- contrib/miscellaneous/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java (revision 683439)
+++ contrib/miscellaneous/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java (working copy)
@@ -23,6 +23,7 @@
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.Query;
@@ -105,21 +106,22 @@
// get Analyzer from superclass and tokenize the term
TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
- org.apache.lucene.analysis.Token t;
+ Token token = new Token();
int countTokens = 0;
while (true) {
try {
- t = source.next();
+ token = source.next(token);
} catch (IOException e) {
- t = null;
+ token = null;
}
- if (t == null) {
+ if (token == null) {
break;
}
- if (!"".equals(t.termText())) {
+ String term = token.term();
+ if (!"".equals(term)) {
try {
- tlist.set(countTokens++, t.termText());
+ tlist.set(countTokens++, term);
} catch (IndexOutOfBoundsException ioobe) {
countTokens = -1;
}
@@ -189,18 +191,18 @@
// get Analyzer from superclass and tokenize the term
TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
List tlist = new ArrayList();
- org.apache.lucene.analysis.Token t;
+ Token token = new Token();
while (true) {
try {
- t = source.next();
+ token = source.next(token);
} catch (IOException e) {
- t = null;
+ token = null;
}
- if (t == null) {
+ if (token == null) {
break;
}
- tlist.add(t.termText());
+ tlist.add(token.term());
}
try {
@@ -238,14 +240,15 @@
throws ParseException {
// get Analyzer from superclass and tokenize the term
TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
- org.apache.lucene.analysis.Token t;
+ Token token = new Token();
+ Token multipleToken = new Token();
boolean multipleTokens = false;
try {
- t = source.next();
- multipleTokens = source.next() != null;
+ token = source.next(token);
+ multipleTokens = source.next(multipleToken) != null;
} catch (IOException e) {
- t = null;
+ token = null;
}
try {
@@ -259,7 +262,7 @@
+ " - tokens were added");
}
- return (t == null) ? null : super.getFuzzyQuery(field, t.termText(), minSimilarity);
+ return (token == null) ? null : super.getFuzzyQuery(field, token.term(), minSimilarity);
}
/**
@@ -270,18 +273,19 @@
throws ParseException {
// get Analyzer from superclass and tokenize the terms
TokenStream source = getAnalyzer().tokenStream(field, new StringReader(part1));
- org.apache.lucene.analysis.Token t;
+ Token token = new Token();
+ Token multipleToken = new Token();
boolean multipleTokens = false;
// part1
try {
- t = source.next();
- if (t != null) {
- part1 = t.termText();
+ token = source.next(token);
+ if (token != null) {
+ part1 = token.term();
}
- multipleTokens = source.next() != null;
+ multipleTokens = source.next(multipleToken) != null;
} catch (IOException e) {
- t = null;
+ token = null;
}
try {
source.close();
@@ -293,16 +297,20 @@
+ " - tokens were added to part1");
}
+ // part2
source = getAnalyzer().tokenStream(field, new StringReader(part2));
- // part2
+ if (token == null)
+ token = new Token();
+ if (multipleToken == null)
+ multipleToken = new Token();
try {
- t = source.next();
- if (t != null) {
- part2 = t.termText();
+ token = source.next(token);
+ if (token != null) {
+ part2 = token.term();
}
- multipleTokens = source.next() != null;
+ multipleTokens = source.next(multipleToken) != null;
} catch (IOException e) {
- t = null;
+ token = null;
}
try {
source.close();
Index: contrib/miscellaneous/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.java
===================================================================
--- contrib/miscellaneous/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.java (revision 683439)
+++ contrib/miscellaneous/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.java (working copy)
@@ -1,14 +1,29 @@
/* Generated By:JavaCC: Do not edit this line. PrecedenceQueryParser.java */
package org.apache.lucene.queryParser.precedence;
+import java.io.IOException;
+import java.io.StringReader;
+import java.text.DateFormat;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+import java.util.Locale;
import java.util.Vector;
-import java.io.*;
-import java.text.*;
-import java.util.*;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.Term;
-import org.apache.lucene.analysis.*;
-import org.apache.lucene.document.*;
-import org.apache.lucene.search.*;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.FuzzyQuery;
+import org.apache.lucene.search.MultiPhraseQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.RangeQuery;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.Parameter;
/**
@@ -296,21 +311,21 @@
TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
Vector v = new Vector();
- org.apache.lucene.analysis.Token t;
+ org.apache.lucene.analysis.Token token = new org.apache.lucene.analysis.Token();
int positionCount = 0;
boolean severalTokensAtSamePosition = false;
while (true) {
try {
- t = source.next();
+ token = source.next(token);
}
catch (IOException e) {
- t = null;
+ token = null;
}
- if (t == null)
+ if (token == null)
break;
- v.addElement(t);
- if (t.getPositionIncrement() == 1)
+ v.addElement(token.clone());
+ if (token.getPositionIncrement() == 1)
positionCount++;
else
severalTokensAtSamePosition = true;
@@ -325,17 +340,17 @@
if (v.size() == 0)
return null;
else if (v.size() == 1) {
- t = (org.apache.lucene.analysis.Token) v.elementAt(0);
- return new TermQuery(new Term(field, t.termText()));
+ token = (org.apache.lucene.analysis.Token) v.elementAt(0);
+ return new TermQuery(new Term(field, token.term()));
} else {
if (severalTokensAtSamePosition) {
if (positionCount == 1) {
// no phrase query:
BooleanQuery q = new BooleanQuery();
for (int i = 0; i < v.size(); i++) {
- t = (org.apache.lucene.analysis.Token) v.elementAt(i);
+ token = (org.apache.lucene.analysis.Token) v.elementAt(i);
TermQuery currentQuery = new TermQuery(
- new Term(field, t.termText()));
+ new Term(field, token.term()));
q.add(currentQuery, BooleanClause.Occur.SHOULD);
}
return q;
@@ -345,12 +360,12 @@
MultiPhraseQuery mpq = new MultiPhraseQuery();
List multiTerms = new ArrayList();
for (int i = 0; i < v.size(); i++) {
- t = (org.apache.lucene.analysis.Token) v.elementAt(i);
- if (t.getPositionIncrement() == 1 && multiTerms.size() > 0) {
+ token = (org.apache.lucene.analysis.Token) v.elementAt(i);
+ if (token.getPositionIncrement() == 1 && multiTerms.size() > 0) {
mpq.add((Term[])multiTerms.toArray(new Term[0]));
multiTerms.clear();
}
- multiTerms.add(new Term(field, t.termText()));
+ multiTerms.add(new Term(field, token.term()));
}
mpq.add((Term[])multiTerms.toArray(new Term[0]));
return mpq;
@@ -361,7 +376,7 @@
q.setSlop(phraseSlop);
for (int i = 0; i < v.size(); i++) {
q.add(new Term(field, ((org.apache.lucene.analysis.Token)
- v.elementAt(i)).termText()));
+ v.elementAt(i)).term()));
}
return q;
Index: contrib/miscellaneous/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.jj
===================================================================
--- contrib/miscellaneous/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.jj (revision 683439)
+++ contrib/miscellaneous/src/java/org/apache/lucene/queryParser/precedence/PrecedenceQueryParser.jj (working copy)
@@ -25,14 +25,29 @@
package org.apache.lucene.queryParser.precedence;
+import java.io.IOException;
+import java.io.StringReader;
+import java.text.DateFormat;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+import java.util.Locale;
import java.util.Vector;
-import java.io.*;
-import java.text.*;
-import java.util.*;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.Term;
-import org.apache.lucene.analysis.*;
-import org.apache.lucene.document.*;
-import org.apache.lucene.search.*;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.FuzzyQuery;
+import org.apache.lucene.search.MultiPhraseQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.RangeQuery;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.Parameter;
/**
@@ -320,21 +335,21 @@
TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
Vector v = new Vector();
- org.apache.lucene.analysis.Token t;
+ org.apache.lucene.analysis.Token token = new org.apache.lucene.analysis.Token();
int positionCount = 0;
boolean severalTokensAtSamePosition = false;
while (true) {
try {
- t = source.next();
+ token = source.next(token);
}
catch (IOException e) {
- t = null;
+ token = null;
}
- if (t == null)
+ if (token == null)
break;
- v.addElement(t);
- if (t.getPositionIncrement() == 1)
+ v.addElement(token.clone());
+ if (token.getPositionIncrement() == 1)
positionCount++;
else
severalTokensAtSamePosition = true;
Index: contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java
===================================================================
--- contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java (revision 683439)
+++ contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java (working copy)
@@ -126,14 +126,13 @@
tcm.put("3.25", "");
tcm.put("3.50", "");
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
- Token token = new Token();
int count = 0;
int numItalics = 0;
int numBoldItalics = 0;
int numCategory = 0;
int numCitation = 0;
- while ((token = tf.next(token)) != null) {
- String tokText = token.termText();
+ for (Token token = tf.next(new Token()); token != null; token = tf.next(token)) {
+ String tokText = token.term();
//System.out.println("Text: " + tokText + " Type: " + token.type());
assertTrue("token is null and it shouldn't be", token != null);
String expectedType = (String) tcm.get(tokText);
@@ -169,73 +168,73 @@
Token token = new Token();
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "click", new String(token.termBuffer(), 0, token.termLength()).equals("click") == true);
+ assertTrue(token.term() + " is not equal to " + "click", token.term().equals("click") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link", new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
+ assertTrue(token.term() + " is not equal to " + "link", token.term().equals("link") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
- new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
+ assertTrue(token.term() + " is not equal to " + "here",
+ token.term().equals("here") == true);
//The link, and here should be at the same position for phrases to work
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "again",
- new String(token.termBuffer(), 0, token.termLength()).equals("again") == true);
+ assertTrue(token.term() + " is not equal to " + "again",
+ token.term().equals("again") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "click",
- new String(token.termBuffer(), 0, token.termLength()).equals("click") == true);
+ assertTrue(token.term() + " is not equal to " + "click",
+ token.term().equals("click") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org",
- new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org") == true);
+ assertTrue(token.term() + " is not equal to " + "http://lucene.apache.org",
+ token.term().equals("http://lucene.apache.org") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
- new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
+ assertTrue(token.term() + " is not equal to " + "here",
+ token.term().equals("here") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "again",
- new String(token.termBuffer(), 0, token.termLength()).equals("again") == true);
+ assertTrue(token.term() + " is not equal to " + "again",
+ token.term().equals("again") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a",
- new String(token.termBuffer(), 0, token.termLength()).equals("a") == true);
+ assertTrue(token.term() + " is not equal to " + "a",
+ token.term().equals("a") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "b",
- new String(token.termBuffer(), 0, token.termLength()).equals("b") == true);
+ assertTrue(token.term() + " is not equal to " + "b",
+ token.term().equals("b") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "c",
- new String(token.termBuffer(), 0, token.termLength()).equals("c") == true);
+ assertTrue(token.term() + " is not equal to " + "c",
+ token.term().equals("c") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "d",
- new String(token.termBuffer(), 0, token.termLength()).equals("d") == true);
+ assertTrue(token.term() + " is not equal to " + "d",
+ token.term().equals("d") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is not null and it should be", token == null);
}
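
Note on the assertions above: they all follow the same consumer-side change, namely one Token allocated up front, handed back to next(Token) on every call, with the text read through term() instead of rebuilding a String from termBuffer()/termLength(). A minimal sketch of that loop, with illustrative names that are not part of the patch:

    import java.io.IOException;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;

    // Illustrative only: one reusable Token instance per stream.
    public class ReusableTokenLoop {
      public static void dump(TokenStream stream) throws IOException {
        Token token = new Token();                        // allocated once
        // next(Token) may hand back a different instance, so the result is
        // always reassigned to the loop variable, as the tests do.
        for (token = stream.next(token); token != null; token = stream.next(token)) {
          // term() replaces new String(token.termBuffer(), 0, token.termLength())
          System.out.println(token.term() + " +" + token.getPositionIncrement());
        }
      }
    }
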
@@ -245,25 +244,25 @@
Token token = new Token();
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org/java/docs/index.html#news",
- new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org/java/docs/index.html#news") == true);
+ assertTrue(token.term() + " is not equal to " + "http://lucene.apache.org/java/docs/index.html#news",
+ token.term().equals("http://lucene.apache.org/java/docs/index.html#news") == true);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
tf.next(token);//skip here
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org/java/docs/index.html?b=c",
- new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org/java/docs/index.html?b=c") == true);
+ assertTrue(token.term() + " is not equal to " + "http://lucene.apache.org/java/docs/index.html?b=c",
+ token.term().equals("http://lucene.apache.org/java/docs/index.html?b=c") == true);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
tf.next(token);//skip here
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "https://lucene.apache.org/java/docs/index.html?b=c",
- new String(token.termBuffer(), 0, token.termLength()).equals("https://lucene.apache.org/java/docs/index.html?b=c") == true);
+ assertTrue(token.term() + " is not equal to " + "https://lucene.apache.org/java/docs/index.html?b=c",
+ token.term().equals("https://lucene.apache.org/java/docs/index.html?b=c") == true);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is not null and it should be", token == null);
}
@@ -277,71 +276,71 @@
checkLinkPhrases(tf);
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.UNTOKENIZED_ONLY, untoks);
- Token token;
- token = tf.next();
+ Token token = new Token();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a b c d",
- new String(token.termBuffer(), 0, token.termLength()).equals("a b c d") == true);
+ assertTrue(token.term() + " is not equal to " + "a b c d",
+ token.term().equals("a b c d") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.startOffset() + " does not equal: " + 11, token.startOffset() == 11);
assertTrue(token.endOffset() + " does not equal: " + 18, token.endOffset() == 18);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "e f g",
- new String(token.termBuffer(), 0, token.termLength()).equals("e f g") == true);
+ assertTrue(token.term() + " is not equal to " + "e f g",
+ token.term().equals("e f g") == true);
assertTrue(token.startOffset() + " does not equal: " + 32, token.startOffset() == 32);
assertTrue(token.endOffset() + " does not equal: " + 37, token.endOffset() == 37);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
- new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
+ assertTrue(token.term() + " is not equal to " + "link",
+ token.term().equals("link") == true);
assertTrue(token.startOffset() + " does not equal: " + 42, token.startOffset() == 42);
assertTrue(token.endOffset() + " does not equal: " + 46, token.endOffset() == 46);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
- new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
+ assertTrue(token.term() + " is not equal to " + "here",
+ token.term().equals("here") == true);
assertTrue(token.startOffset() + " does not equal: " + 47, token.startOffset() == 47);
assertTrue(token.endOffset() + " does not equal: " + 51, token.endOffset() == 51);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
- new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
+ assertTrue(token.term() + " is not equal to " + "link",
+ token.term().equals("link") == true);
assertTrue(token.startOffset() + " does not equal: " + 56, token.startOffset() == 56);
assertTrue(token.endOffset() + " does not equal: " + 60, token.endOffset() == 60);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "there",
- new String(token.termBuffer(), 0, token.termLength()).equals("there") == true);
+ assertTrue(token.term() + " is not equal to " + "there",
+ token.term().equals("there") == true);
assertTrue(token.startOffset() + " does not equal: " + 61, token.startOffset() == 61);
assertTrue(token.endOffset() + " does not equal: " + 66, token.endOffset() == 66);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics here",
- new String(token.termBuffer(), 0, token.termLength()).equals("italics here") == true);
+ assertTrue(token.term() + " is not equal to " + "italics here",
+ token.term().equals("italics here") == true);
assertTrue(token.startOffset() + " does not equal: " + 71, token.startOffset() == 71);
assertTrue(token.endOffset() + " does not equal: " + 83, token.endOffset() == 83);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "something",
- new String(token.termBuffer(), 0, token.termLength()).equals("something") == true);
+ assertTrue(token.term() + " is not equal to " + "something",
+ token.term().equals("something") == true);
assertTrue(token.startOffset() + " does not equal: " + 86, token.startOffset() == 86);
assertTrue(token.endOffset() + " does not equal: " + 95, token.endOffset() == 95);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "more italics",
- new String(token.termBuffer(), 0, token.termLength()).equals("more italics") == true);
+ assertTrue(token.term() + " is not equal to " + "more italics",
+ token.term().equals("more italics") == true);
assertTrue(token.startOffset() + " does not equal: " + 98, token.startOffset() == 98);
assertTrue(token.endOffset() + " does not equal: " + 110, token.endOffset() == 110);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "h i j",
- new String(token.termBuffer(), 0, token.termLength()).equals("h i j") == true);
+ assertTrue(token.term() + " is not equal to " + "h i j",
+ token.term().equals("h i j") == true);
assertTrue(token.startOffset() + " does not equal: " + 124, token.startOffset() == 124);
assertTrue(token.endOffset() + " does not equal: " + 133, token.endOffset() == 133);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is not null and it should be", token == null);
}
@@ -352,48 +351,48 @@
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
//should output all the individual tokens plus the untokenized tokens as well. Untokenized tokens
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.BOTH, untoks);
- Token token;
- token = tf.next();
+ Token token = new Token();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a b c d",
- new String(token.termBuffer(), 0, token.termLength()).equals("a b c d") == true);
+ assertTrue(token.term() + " is not equal to " + "a b c d",
+ token.term().equals("a b c d") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(token.startOffset() + " does not equal: " + 11, token.startOffset() == 11);
assertTrue(token.endOffset() + " does not equal: " + 18, token.endOffset() == 18);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a",
- new String(token.termBuffer(), 0, token.termLength()).equals("a") == true);
+ assertTrue(token.term() + " is not equal to " + "a",
+ token.term().equals("a") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.getFlags() + " equals: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG + " and it shouldn't", token.getFlags() != WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(token.startOffset() + " does not equal: " + 11, token.startOffset() == 11);
assertTrue(token.endOffset() + " does not equal: " + 12, token.endOffset() == 12);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "b",
- new String(token.termBuffer(), 0, token.termLength()).equals("b") == true);
+ assertTrue(token.term() + " is not equal to " + "b",
+ token.term().equals("b") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.startOffset() + " does not equal: " + 13, token.startOffset() == 13);
assertTrue(token.endOffset() + " does not equal: " + 14, token.endOffset() == 14);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "c",
- new String(token.termBuffer(), 0, token.termLength()).equals("c") == true);
+ assertTrue(token.term() + " is not equal to " + "c",
+ token.term().equals("c") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.startOffset() + " does not equal: " + 15, token.startOffset() == 15);
assertTrue(token.endOffset() + " does not equal: " + 16, token.endOffset() == 16);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "d",
- new String(token.termBuffer(), 0, token.termLength()).equals("d") == true);
+ assertTrue(token.term() + " is not equal to " + "d",
+ token.term().equals("d") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.startOffset() + " does not equal: " + 17, token.startOffset() == 17);
@@ -401,175 +400,175 @@
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "e f g",
- new String(token.termBuffer(), 0, token.termLength()).equals("e f g") == true);
+ assertTrue(token.term() + " is not equal to " + "e f g",
+ token.term().equals("e f g") == true);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(token.startOffset() + " does not equal: " + 32, token.startOffset() == 32);
assertTrue(token.endOffset() + " does not equal: " + 37, token.endOffset() == 37);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "e",
- new String(token.termBuffer(), 0, token.termLength()).equals("e") == true);
+ assertTrue(token.term() + " is not equal to " + "e",
+ token.term().equals("e") == true);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
assertTrue(token.startOffset() + " does not equal: " + 32, token.startOffset() == 32);
assertTrue(token.endOffset() + " does not equal: " + 33, token.endOffset() == 33);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "f",
- new String(token.termBuffer(), 0, token.termLength()).equals("f") == true);
+ assertTrue(token.term() + " is not equal to " + "f",
+ token.term().equals("f") == true);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.startOffset() + " does not equal: " + 34, token.startOffset() == 34);
assertTrue(token.endOffset() + " does not equal: " + 35, token.endOffset() == 35);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "g",
- new String(token.termBuffer(), 0, token.termLength()).equals("g") == true);
+ assertTrue(token.term() + " is not equal to " + "g",
+ token.term().equals("g") == true);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.startOffset() + " does not equal: " + 36, token.startOffset() == 36);
assertTrue(token.endOffset() + " does not equal: " + 37, token.endOffset() == 37);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
- new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
+ assertTrue(token.term() + " is not equal to " + "link",
+ token.term().equals("link") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
assertTrue(token.startOffset() + " does not equal: " + 42, token.startOffset() == 42);
assertTrue(token.endOffset() + " does not equal: " + 46, token.endOffset() == 46);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
- new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
+ assertTrue(token.term() + " is not equal to " + "here",
+ token.term().equals("here") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
assertTrue(token.startOffset() + " does not equal: " + 47, token.startOffset() == 47);
assertTrue(token.endOffset() + " does not equal: " + 51, token.endOffset() == 51);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
- new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
+ assertTrue(token.term() + " is not equal to " + "link",
+ token.term().equals("link") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.startOffset() + " does not equal: " + 56, token.startOffset() == 56);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
assertTrue(token.endOffset() + " does not equal: " + 60, token.endOffset() == 60);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "there",
- new String(token.termBuffer(), 0, token.termLength()).equals("there") == true);
+ assertTrue(token.term() + " is not equal to " + "there",
+ token.term().equals("there") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
assertTrue(token.startOffset() + " does not equal: " + 61, token.startOffset() == 61);
assertTrue(token.endOffset() + " does not equal: " + 66, token.endOffset() == 66);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics here",
- new String(token.termBuffer(), 0, token.termLength()).equals("italics here") == true);
+ assertTrue(token.term() + " is not equal to " + "italics here",
+ token.term().equals("italics here") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(token.startOffset() + " does not equal: " + 71, token.startOffset() == 71);
assertTrue(token.endOffset() + " does not equal: " + 83, token.endOffset() == 83);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics",
- new String(token.termBuffer(), 0, token.termLength()).equals("italics") == true);
+ assertTrue(token.term() + " is not equal to " + "italics",
+ token.term().equals("italics") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(token.startOffset() + " does not equal: " + 71, token.startOffset() == 71);
assertTrue(token.endOffset() + " does not equal: " + 78, token.endOffset() == 78);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
- new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
+ assertTrue(token.term() + " is not equal to " + "here",
+ token.term().equals("here") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(token.startOffset() + " does not equal: " + 79, token.startOffset() == 79);
assertTrue(token.endOffset() + " does not equal: " + 83, token.endOffset() == 83);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "something",
- new String(token.termBuffer(), 0, token.termLength()).equals("something") == true);
+ assertTrue(token.term() + " is not equal to " + "something",
+ token.term().equals("something") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.startOffset() + " does not equal: " + 86, token.startOffset() == 86);
assertTrue(token.endOffset() + " does not equal: " + 95, token.endOffset() == 95);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "more italics",
- new String(token.termBuffer(), 0, token.termLength()).equals("more italics") == true);
+ assertTrue(token.term() + " is not equal to " + "more italics",
+ token.term().equals("more italics") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(token.startOffset() + " does not equal: " + 98, token.startOffset() == 98);
assertTrue(token.endOffset() + " does not equal: " + 110, token.endOffset() == 110);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "more",
- new String(token.termBuffer(), 0, token.termLength()).equals("more") == true);
+ assertTrue(token.term() + " is not equal to " + "more",
+ token.term().equals("more") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(token.startOffset() + " does not equal: " + 98, token.startOffset() == 98);
assertTrue(token.endOffset() + " does not equal: " + 102, token.endOffset() == 102);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics",
- new String(token.termBuffer(), 0, token.termLength()).equals("italics") == true);
+ assertTrue(token.term() + " is not equal to " + "italics",
+ token.term().equals("italics") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(token.startOffset() + " does not equal: " + 103, token.startOffset() == 103);
assertTrue(token.endOffset() + " does not equal: " + 110, token.endOffset() == 110);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "h i j",
- new String(token.termBuffer(), 0, token.termLength()).equals("h i j") == true);
+ assertTrue(token.term() + " is not equal to " + "h i j",
+ token.term().equals("h i j") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(token.startOffset() + " does not equal: " + 124, token.startOffset() == 124);
assertTrue(token.endOffset() + " does not equal: " + 133, token.endOffset() == 133);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "h",
- new String(token.termBuffer(), 0, token.termLength()).equals("h") == true);
+ assertTrue(token.term() + " is not equal to " + "h",
+ token.term().equals("h") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.startOffset() + " does not equal: " + 124, token.startOffset() == 124);
assertTrue(token.endOffset() + " does not equal: " + 125, token.endOffset() == 125);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "i",
- new String(token.termBuffer(), 0, token.termLength()).equals("i") == true);
+ assertTrue(token.term() + " is not equal to " + "i",
+ token.term().equals("i") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.startOffset() + " does not equal: " + 128, token.startOffset() == 128);
assertTrue(token.endOffset() + " does not equal: " + 129, token.endOffset() == 129);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
- assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "j",
- new String(token.termBuffer(), 0, token.termLength()).equals("j") == true);
+ assertTrue(token.term() + " is not equal to " + "j",
+ token.term().equals("j") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.startOffset() + " does not equal: " + 132, token.startOffset() == 132);
assertTrue(token.endOffset() + " does not equal: " + 133, token.endOffset() == 133);
- token = tf.next();
+ token = tf.next(token);
assertTrue("token is not null and it should be", token == null);
}
Index: contrib/memory/src/test/org/apache/lucene/index/memory/PatternAnalyzerTest.java
===================================================================
--- contrib/memory/src/test/org/apache/lucene/index/memory/PatternAnalyzerTest.java (revision 683439)
+++ contrib/memory/src/test/org/apache/lucene/index/memory/PatternAnalyzerTest.java (working copy)
@@ -197,9 +197,8 @@
private List getTokens(TokenStream stream) throws IOException {
ArrayList tokens = new ArrayList();
- Token token;
- while ((token = stream.next()) != null) {
- tokens.add(token);
+ for (Token token = stream.next(new Token()); token != null; token = stream.next(token)) {
+ tokens.add(token.clone());
}
return tokens;
}
@@ -211,7 +210,7 @@
for (; i < size; i++) {
Token t1 = (Token) tokens1.get(i);
Token t2 = (Token) tokens2.get(i);
- if (!(t1.termText().equals(t2.termText()))) throw new IllegalStateException("termText");
+ if (!(t1.term().equals(t2.term()))) throw new IllegalStateException("termText");
if (t1.startOffset() != t2.startOffset()) throw new IllegalStateException("startOffset");
if (t1.endOffset() != t2.endOffset()) throw new IllegalStateException("endOffset");
if (!(t1.type().equals(t2.type()))) throw new IllegalStateException("type");
@@ -222,8 +221,8 @@
catch (IllegalStateException e) {
if (size > 0) {
System.out.println("i=" + i + ", size=" + size);
- System.out.println("t1[size]='" + ((Token) tokens1.get(size-1)).termText() + "'");
- System.out.println("t2[size]='" + ((Token) tokens2.get(size-1)).termText() + "'");
+ System.out.println("t1[size]='" + ((Token) tokens1.get(size-1)).term() + "'");
+ System.out.println("t2[size]='" + ((Token) tokens2.get(size-1)).term() + "'");
}
throw e;
}
@@ -234,7 +233,7 @@
String str = "[";
for (int i=0; i < tokens.size(); i++) {
Token t1 = (Token) tokens.get(i);
- str = str + "'" + t1.termText() + "', ";
+ str = str + "'" + t1.term() + "', ";
}
return str + "]";
}
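
The getTokens() change above illustrates the one caveat of the reusable API: because the same Token instance is recycled on every call, any code that stores tokens for later comparison has to copy them first, hence tokens.add(token.clone()). A hedged sketch of that collection idiom (class and method names are illustrative):

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;

    // Illustrative only: collect a stream into a list without aliasing the
    // single reusable instance.
    public final class TokenCollector {
      public static List collect(TokenStream stream) throws IOException {
        List tokens = new ArrayList();
        for (Token token = stream.next(new Token()); token != null; token = stream.next(token)) {
          tokens.add(token.clone());   // without clone(), every entry would be the same object
        }
        return tokens;
      }
    }
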
Index: contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
===================================================================
--- contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (revision 683439)
+++ contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (working copy)
@@ -275,7 +275,7 @@
return new TokenStream() {
private Iterator iter = keywords.iterator();
private int start = 0;
- public Token next() {
+ public Token next(Token token) {
if (!iter.hasNext()) return null;
Object obj = iter.next();
@@ -283,7 +283,11 @@
throw new IllegalArgumentException("keyword must not be null");
String term = obj.toString();
- Token token = new Token(term, start, start + term.length());
+ token.clear();
+ token.setTermBuffer(term);
+ token.setStartOffset(start);
+ token.setEndOffset(start + token.termLength());
+ token.setType(Token.DEFAULT_TYPE);
start += term.length() + 1; // separate words by 1 (blank) character
return token;
}
@@ -349,10 +353,8 @@
HashMap terms = new HashMap();
int numTokens = 0;
int pos = -1;
- Token token;
-
- while ((token = stream.next()) != null) {
- String term = token.termText();
+ for (Token token = stream.next(new Token()); token != null; token = stream.next(token)) {
+ String term = token.term();
if (term.length() == 0) continue; // nothing to do
// if (DEBUG) System.err.println("token='" + term + "'");
numTokens++;
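
The MemoryIndex hunks show the producer side of the same change: instead of allocating a new Token per keyword, next(Token) clears and refills the instance the caller passed in. A rough sketch of that fill pattern, assuming the iterator yields plain strings (the class and field names below are illustrative, not the patch's):

    import java.util.Iterator;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;

    // Illustrative only: a keyword-style stream that reuses the caller's Token.
    final class KeywordIteratorStream extends TokenStream {
      private final Iterator iter;
      private int start = 0;

      KeywordIteratorStream(Iterator keywords) { this.iter = keywords; }

      public Token next(Token token) {
        if (!iter.hasNext()) return null;             // end of stream
        String term = iter.next().toString();
        token.clear();                                // reset reused state before refilling
        token.setTermBuffer(term);
        token.setStartOffset(start);
        token.setEndOffset(start + term.length());
        token.setType(Token.DEFAULT_TYPE);
        start += term.length() + 1;                   // one blank character between words
        return token;
      }
    }
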
Index: contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java
===================================================================
--- contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java (revision 683439)
+++ contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java (working copy)
@@ -73,8 +73,8 @@
return new TokenFilter(child.tokenStream(fieldName, reader)) {
private int position = -1;
- public Token next() throws IOException {
- Token token = input.next(); // from filter super class
+ public Token next(Token token) throws IOException {
+ token = input.next(token); // from filter super class
log.println(toString(token));
return token;
}
@@ -84,7 +84,7 @@
position += token.getPositionIncrement();
return "[" + logName + ":" + position + ":" + fieldName + ":"
- + token.termText() + ":" + token.startOffset()
+ + token.term() + ":" + token.startOffset()
+ "-" + token.endOffset() + ":" + token.type()
+ "]";
}
@@ -121,8 +121,8 @@
return new TokenFilter(child.tokenStream(fieldName, reader)) {
private int todo = maxTokens;
- public Token next() throws IOException {
- return --todo >= 0 ? input.next() : null;
+ public Token next(Token token) throws IOException {
+ return --todo >= 0 ? input.next(token) : null;
}
};
}
@@ -239,9 +239,9 @@
final ArrayList tokens2 = new ArrayList();
TokenStream tokenStream = new TokenFilter(child.tokenStream(fieldName, reader)) {
- public Token next() throws IOException {
- Token token = input.next(); // from filter super class
- if (token != null) tokens2.add(token);
+ public Token next(Token token) throws IOException {
+ token = input.next(token); // from filter super class
+ if (token != null) tokens2.add(token.clone());
return token;
}
};
@@ -253,7 +253,7 @@
private Iterator iter = tokens.iterator();
- public Token next() {
+ public Token next(Token token) {
if (!iter.hasNext()) return null;
return (Token) iter.next();
}
@@ -300,12 +300,11 @@
HashMap map = new HashMap();
TokenStream stream = analyzer.tokenStream("", new StringReader(text));
try {
- Token token;
- while ((token = stream.next()) != null) {
- MutableInteger freq = (MutableInteger) map.get(token.termText());
+ for (Token token = stream.next(new Token()); token != null; token = stream.next(token)) {
+ MutableInteger freq = (MutableInteger) map.get(token.term());
if (freq == null) {
freq = new MutableInteger(1);
- map.put(token.termText(), freq);
+ map.put(token.term(), freq);
} else {
freq.setValue(freq.intValue() + 1);
}
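
The AnalyzerUtil filters above reduce to a pass-through shape: delegate to input.next(token), inspect or log the result, and clone only when the token is cached beyond the current call, as the tokens2 list does. A minimal hedged sketch of such a pass-through filter (the logging destination and class name are illustrative):

    import java.io.IOException;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;

    // Illustrative only: forward the reusable token, log it, never hold on to it.
    final class LoggingFilter extends TokenFilter {
      LoggingFilter(TokenStream input) { super(input); }

      public Token next(Token token) throws IOException {
        token = input.next(token);                    // delegate to the wrapped stream
        if (token != null) {
          System.out.println(token.term() + " [" + token.startOffset() + "-" + token.endOffset() + "]");
        }
        return token;                                 // same instance continues downstream
      }
    }
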
Index: contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java
===================================================================
--- contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java (revision 683439)
+++ contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java (working copy)
@@ -334,7 +334,7 @@
this.toLowerCase = toLowerCase;
}
- public Token next() {
+ public Token next(Token token) {
if (matcher == null) return null;
while (true) { // loop takes care of leading and trailing boundary cases
@@ -352,7 +352,12 @@
if (start != end) { // non-empty match (header/trailer)
String text = str.substring(start, end);
if (toLowerCase) text = text.toLowerCase(locale);
- return new Token(text, start, end);
+ token.clear();
+ token.setTermBuffer(text);
+ token.setStartOffset(start);
+ token.setEndOffset(end);
+ token.setType(Token.DEFAULT_TYPE);
+ return token;
}
if (!isMatch) return null;
}
@@ -384,7 +389,7 @@
this.stopWords = stopWords;
}
- public Token next() {
+ public Token next(Token token) {
// cache loop instance vars (performance)
String s = str;
int len = s.length();
@@ -422,7 +427,16 @@
} while (text != null && isStopWord(text));
pos = i;
- return text != null ? new Token(text, start, i) : null;
+ if (text == null)
+ {
+ return null;
+ }
+ token.clear();
+ token.setTermBuffer(text);
+ token.setStartOffset(start);
+ token.setEndOffset(i);
+ token.setType(Token.DEFAULT_TYPE);
+ return token;
}
private boolean isTokenChar(char c, boolean isLetter) {
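
Both PatternAnalyzer tokenizers now end with the same five-line fill sequence instead of a Token constructor call. If that sequence keeps repeating, it could be factored into a tiny helper like the hypothetical one below, which is not part of the patch and only restates what the hunks already do:

    import org.apache.lucene.analysis.Token;

    // Hypothetical helper, not in the patch: refill a reused Token in one place.
    final class TokenFiller {
      static Token fill(Token token, String text, int start, int end) {
        token.clear();
        token.setTermBuffer(text);
        token.setStartOffset(start);
        token.setEndOffset(end);
        token.setType(Token.DEFAULT_TYPE);
        return token;
      }
    }
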
Index: contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java
===================================================================
--- contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java (revision 683439)
+++ contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java (working copy)
@@ -68,23 +68,22 @@
}
/** Returns the next token in the stream, or null at EOS. */
- public Token next() throws IOException {
- Token token;
+ public Token next(Token token) throws IOException {
while (todo > 0 && index < stack.length) { // pop from stack
- token = createToken(stack[index++], current);
+ token = createToken(stack[index++], current, token);
if (token != null) {
todo--;
return token;
}
}
- token = input.next();
+ token = input.next(token);
if (token == null) return null; // EOS; iterator exhausted
- stack = synonyms.getSynonyms(token.termText()); // push onto stack
+ stack = synonyms.getSynonyms(token.term()); // push onto stack
if (stack.length > maxSynonyms) randomize(stack);
index = 0;
- current = token;
+ current = (Token) token.clone();
todo = maxSynonyms;
return token;
}
@@ -100,11 +99,15 @@
* @return a new token, or null to indicate that the given synonym should be
* ignored
*/
- protected Token createToken(String synonym, Token current) {
- Token token = new Token(
- synonym, current.startOffset(), current.endOffset(), SYNONYM_TOKEN_TYPE);
- token.setPositionIncrement(0);
- return token;
+ protected Token createToken(String synonym, Token current, Token result) {
+ result.setTermBuffer(synonym);
+ result.setStartOffset(current.startOffset());
+ result.setEndOffset(current.endOffset());
+ result.setType(SYNONYM_TOKEN_TYPE);
+ result.setPositionIncrement(0);
+ result.setPayload(current.getPayload());
+ result.setFlags(current.getFlags());
+ return result;
}
/**
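
SynonymTokenFilter is the stateful case: it remembers the current input token across calls while it emits synonyms, so the remembered copy must be cloned (current = (Token) token.clone()); keeping the reusable instance itself would see it overwritten on the next call. A compressed sketch of that rule, using an invented filter that simply repeats each token once (names and behavior are illustrative only):

    import java.io.IOException;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;

    // Illustrative only: a filter that keeps state across calls must clone
    // whatever it saves, because the incoming Token is reused by the producer.
    final class RepeatOnceFilter extends TokenFilter {
      private Token saved;                            // private snapshot, never the reused instance

      RepeatOnceFilter(TokenStream input) { super(input); }

      public Token next(Token token) throws IOException {
        if (saved != null) {                          // emit the remembered copy at the same position
          Token result = saved;
          saved = null;
          result.setPositionIncrement(0);
          return result;
        }
        token = input.next(token);
        if (token == null) return null;
        saved = (Token) token.clone();                // snapshot before the instance is recycled
        return token;
      }
    }
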
Index: contrib/lucli/src/java/lucli/LuceneMethods.java
===================================================================
--- contrib/lucli/src/java/lucli/LuceneMethods.java (revision 683439)
+++ contrib/lucli/src/java/lucli/LuceneMethods.java (working copy)
@@ -279,6 +279,7 @@
Analyzer analyzer = new StandardAnalyzer();
Enumeration fields = doc.fields();
+ Token reusableToken = new Token();
while (fields.hasMoreElements()) {
Field field = (Field) fields.nextElement();
String fieldName = field.name();
@@ -299,10 +300,11 @@
// Tokenize field and add to postingTable
TokenStream stream = analyzer.tokenStream(fieldName, reader);
try {
- for (Token t = stream.next(); t != null; t = stream.next()) {
- position += (t.getPositionIncrement() - 1);
+ Token token = reusableToken;
+ for (token = stream.next(token); token != null; token = stream.next(token)) {
+ position += (token.getPositionIncrement() - 1);
position++;
- String name = t.termText();
+ String name = token.term();
Integer Count = (Integer) tokenHash.get(name);
if (Count == null) { // not in there yet
tokenHash.put(name, new Integer(1)); //first one
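
The LuceneMethods loop also shows how absolute positions are derived from increments under the new signature: the reusable token is created outside the per-field loop and the position simply accumulates getPositionIncrement() per token. A small hedged sketch of that accumulation (names are illustrative):

    import java.io.IOException;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;

    // Illustrative only: turn per-token increments into absolute positions.
    final class PositionPrinter {
      static void print(TokenStream stream) throws IOException {
        int position = -1;                            // so a first increment of 1 lands on position 0
        Token token = new Token();
        for (token = stream.next(token); token != null; token = stream.next(token)) {
          position += token.getPositionIncrement();
          System.out.println(position + ": " + token.term());
        }
      }
    }
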
Index: contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java
===================================================================
--- contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java (revision 683439)
+++ contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java (working copy)
@@ -33,12 +33,10 @@
{
String s = "a天b";
ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s));
- Token token;
int correctStartOffset = 0;
int correctEndOffset = 1;
- while ((token = tokenizer.next()) != null)
- {
+ for (Token token = tokenizer.next(new Token()); token != null; token = tokenizer.next(token)) {
assertEquals(correctStartOffset, token.startOffset());
assertEquals(correctEndOffset, token.endOffset());
correctStartOffset++;
Index: contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java
===================================================================
--- contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java (revision 683439)
+++ contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java (working copy)
@@ -42,12 +42,13 @@
*/
private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+ Token token = new Token();
for (int i=0; i tokens;
- // test a plain old token stream with synonyms tranlated to rows.
+ // test a plain old token stream with synonyms translated to rows.
tokens = new LinkedList();
- tokens.add(new Token("please", 0, 6));
- tokens.add(new Token("divide", 7, 13));
- tokens.add(new Token("this", 14, 18));
- tokens.add(new Token("sentence", 19, 27));
- tokens.add(new Token("into", 28, 32));
- tokens.add(new Token("shingles", 33, 39));
+ tokens.add(createToken("please", 0, 6));
+ tokens.add(createToken("divide", 7, 13));
+ tokens.add(createToken("this", 14, 18));
+ tokens.add(createToken("sentence", 19, 27));
+ tokens.add(createToken("into", 28, 32));
+ tokens.add(createToken("shingles", 33, 39));
tls = new TokenListStream(tokens);
@@ -70,21 +64,23 @@
ts = new ShingleMatrixFilter(tls, 1, 2, ' ', false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec());
- assertNext(ts, "please", 0, 6);
- assertNext(ts, "please divide", 0, 13);
- assertNext(ts, "divide", 7, 13);
- assertNext(ts, "divide this", 7, 18);
- assertNext(ts, "this", 14, 18);
- assertNext(ts, "this sentence", 14, 27);
- assertNext(ts, "sentence", 19, 27);
- assertNext(ts, "sentence into", 19, 32);
- assertNext(ts, "into", 28, 32);
- assertNext(ts, "into shingles", 28, 39);
- assertNext(ts, "shingles", 33, 39);
+ Token token = new Token();
+ assertNext(ts, token, "please", 0, 6);
+ assertNext(ts, token, "please divide", 0, 13);
+ assertNext(ts, token, "divide", 7, 13);
+ assertNext(ts, token, "divide this", 7, 18);
+ assertNext(ts, token, "this", 14, 18);
+ assertNext(ts, token, "this sentence", 14, 27);
+ assertNext(ts, token, "sentence", 19, 27);
+ assertNext(ts, token, "sentence into", 19, 32);
+ assertNext(ts, token, "into", 28, 32);
+ assertNext(ts, token, "into shingles", 28, 39);
+ assertNext(ts, token, "shingles", 33, 39);
- assertNull(ts.next());
+ assertNull(ts.next(token));
+
}
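
The updated shingle assertions thread the reusable token through an assertNext helper. The helper itself lies outside the hunks shown here; a hedged reconstruction of the term/offset variant might look like the sketch below, where the signature and body are assumptions rather than the patch's code:

    import java.io.IOException;
    import junit.framework.Assert;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;

    // Assumed shape only; the real assertNext lives in the test class and is not shown in this patch.
    final class ShingleAsserts {
      static Token assertNext(TokenStream ts, Token token, String term, int start, int end) throws IOException {
        token = ts.next(token);
        Assert.assertNotNull(token);
        Assert.assertEquals(term, token.term());
        Assert.assertEquals(start, token.startOffset());
        Assert.assertEquals(end, token.endOffset());
        return token;
      }
    }
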
/**
@@ -95,9 +91,6 @@
ShingleMatrixFilter.defaultSettingsCodec = null;//new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();
- Token token = new Token(); // for debug use only
-
-
TokenStream ts;
TokenListStream tls;
LinkedList tokens;
@@ -117,25 +110,27 @@
ts = new ShingleMatrixFilter(tls, 2, 2, '_', false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
- assertNext(ts, "hello_world");
- assertNext(ts, "greetings_world");
- assertNext(ts, "hello_earth");
- assertNext(ts, "greetings_earth");
- assertNext(ts, "hello_tellus");
- assertNext(ts, "greetings_tellus");
- assertNull(ts.next());
+ Token token = new Token();
+ assertNext(ts, token, "hello_world");
+ assertNext(ts, token, "greetings_world");
+ assertNext(ts, token, "hello_earth");
+ assertNext(ts, token, "greetings_earth");
+ assertNext(ts, token, "hello_tellus");
+ assertNext(ts, token, "greetings_tellus");
+ assertNull(ts.next(token));
// bi-grams with no spacer character, start offset, end offset
tls.reset();
ts = new ShingleMatrixFilter(tls, 2, 2, null, false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
- assertNext(ts, "helloworld", 0, 10);
- assertNext(ts, "greetingsworld", 0, 10);
- assertNext(ts, "helloearth", 0, 10);
- assertNext(ts, "greetingsearth", 0, 10);
- assertNext(ts, "hellotellus", 0, 10);
- assertNext(ts, "greetingstellus", 0, 10);
- assertNull(ts.next());
+ token = new Token();
+ assertNext(ts, token, "helloworld", 0, 10);
+ assertNext(ts, token, "greetingsworld", 0, 10);
+ assertNext(ts, token, "helloearth", 0, 10);
+ assertNext(ts, token, "greetingsearth", 0, 10);
+ assertNext(ts, token, "hellotellus", 0, 10);
+ assertNext(ts, token, "greetingstellus", 0, 10);
+ assertNull(ts.next(token));
// add ^_prefix_and_suffix_$
@@ -160,119 +155,122 @@
ts = new ShingleMatrixFilter(tls, 2, 2, '_', false);
//
-// while ((token = ts.next(token)) != null) {
-// System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
+// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
// token.clear();
// }
- assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
- assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
- assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
- assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
- assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
- assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
- assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
- assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
- assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
- assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
- assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
- assertNull(ts.next());
+ token = new Token();
+ assertNext(ts, token, "^_hello", 1, 10.049875f, 0, 4);
+ assertNext(ts, token, "^_greetings", 1, 10.049875f, 0, 4);
+ assertNext(ts, token, "hello_world", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "greetings_world", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "hello_earth", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "greetings_earth", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "hello_tellus", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "greetings_tellus", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "world_$", 1, 7.1414285f, 5, 10);
+ assertNext(ts, token, "earth_$", 1, 7.1414285f, 5, 10);
+ assertNext(ts, token, "tellus_$", 1, 7.1414285f, 5, 10);
+ assertNull(ts.next(token));
// test unlimited size and allow single boundary token as shingle
tls.reset();
ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, '_', false);
//
-// while ((token = ts.next(token)) != null) {
-// System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
+// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
// token.clear();
// }
- assertNext(ts, "^", 1, 10.0f, 0, 0);
- assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
- assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
- assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
- assertNext(ts, "hello", 1, 1.0f, 0, 4);
- assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
- assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
- assertNext(ts, "world", 1, 1.0f, 5, 10);
- assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
- assertNext(ts, "$", 1, 7.071068f, 10, 10);
- assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
- assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
- assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
- assertNext(ts, "greetings", 1, 1.0f, 0, 4);
- assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
- assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
- assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
- assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
- assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
- assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
- assertNext(ts, "earth", 1, 1.0f, 5, 10);
- assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
- assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
- assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
- assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
- assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
- assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
- assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
- assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
- assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
- assertNext(ts, "tellus", 1, 1.0f, 5, 10);
- assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
- assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
- assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
- assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
- assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
+ token = new Token();
+ assertNext(ts, token, "^", 1, 10.0f, 0, 0);
+ assertNext(ts, token, "^_hello", 1, 10.049875f, 0, 4);
+ assertNext(ts, token, "^_hello_world", 1, 10.099504f, 0, 10);
+ assertNext(ts, token, "^_hello_world_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, token, "hello", 1, 1.0f, 0, 4);
+ assertNext(ts, token, "hello_world", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "hello_world_$", 1, 7.2111025f, 0, 10);
+ assertNext(ts, token, "world", 1, 1.0f, 5, 10);
+ assertNext(ts, token, "world_$", 1, 7.1414285f, 5, 10);
+ assertNext(ts, token, "$", 1, 7.071068f, 10, 10);
+ assertNext(ts, token, "^_greetings", 1, 10.049875f, 0, 4);
+ assertNext(ts, token, "^_greetings_world", 1, 10.099504f, 0, 10);
+ assertNext(ts, token, "^_greetings_world_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, token, "greetings", 1, 1.0f, 0, 4);
+ assertNext(ts, token, "greetings_world", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "greetings_world_$", 1, 7.2111025f, 0, 10);
+ assertNext(ts, token, "^_hello_earth", 1, 10.099504f, 0, 10);
+ assertNext(ts, token, "^_hello_earth_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, token, "hello_earth", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "hello_earth_$", 1, 7.2111025f, 0, 10);
+ assertNext(ts, token, "earth", 1, 1.0f, 5, 10);
+ assertNext(ts, token, "earth_$", 1, 7.1414285f, 5, 10);
+ assertNext(ts, token, "^_greetings_earth", 1, 10.099504f, 0, 10);
+ assertNext(ts, token, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, token, "greetings_earth", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "greetings_earth_$", 1, 7.2111025f, 0, 10);
+ assertNext(ts, token, "^_hello_tellus", 1, 10.099504f, 0, 10);
+ assertNext(ts, token, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, token, "hello_tellus", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "hello_tellus_$", 1, 7.2111025f, 0, 10);
+ assertNext(ts, token, "tellus", 1, 1.0f, 5, 10);
+ assertNext(ts, token, "tellus_$", 1, 7.1414285f, 5, 10);
+ assertNext(ts, token, "^_greetings_tellus", 1, 10.099504f, 0, 10);
+ assertNext(ts, token, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, token, "greetings_tellus", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
- assertNull(ts.next());
+ assertNull(ts.next(token));
// test unlimited size but don't allow single boundary token as shingle
tls.reset();
ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, '_', true);
-// while ((token = ts.next(token)) != null) {
-// System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
+// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
// token.clear();
// }
- assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
- assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
- assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
- assertNext(ts, "hello", 1, 1.0f, 0, 4);
- assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
- assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
- assertNext(ts, "world", 1, 1.0f, 5, 10);
- assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
- assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
- assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
- assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
- assertNext(ts, "greetings", 1, 1.0f, 0, 4);
- assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
- assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
- assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
- assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
- assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
- assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
- assertNext(ts, "earth", 1, 1.0f, 5, 10);
- assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
- assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
- assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
- assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
- assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
- assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
- assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
- assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
- assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
- assertNext(ts, "tellus", 1, 1.0f, 5, 10);
- assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
- assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
- assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
- assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
- assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
+ token = new Token();
+ assertNext(ts, token, "^_hello", 1, 10.049875f, 0, 4);
+ assertNext(ts, token, "^_hello_world", 1, 10.099504f, 0, 10);
+ assertNext(ts, token, "^_hello_world_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, token, "hello", 1, 1.0f, 0, 4);
+ assertNext(ts, token, "hello_world", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "hello_world_$", 1, 7.2111025f, 0, 10);
+ assertNext(ts, token, "world", 1, 1.0f, 5, 10);
+ assertNext(ts, token, "world_$", 1, 7.1414285f, 5, 10);
+ assertNext(ts, token, "^_greetings", 1, 10.049875f, 0, 4);
+ assertNext(ts, token, "^_greetings_world", 1, 10.099504f, 0, 10);
+ assertNext(ts, token, "^_greetings_world_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, token, "greetings", 1, 1.0f, 0, 4);
+ assertNext(ts, token, "greetings_world", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "greetings_world_$", 1, 7.2111025f, 0, 10);
+ assertNext(ts, token, "^_hello_earth", 1, 10.099504f, 0, 10);
+ assertNext(ts, token, "^_hello_earth_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, token, "hello_earth", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "hello_earth_$", 1, 7.2111025f, 0, 10);
+ assertNext(ts, token, "earth", 1, 1.0f, 5, 10);
+ assertNext(ts, token, "earth_$", 1, 7.1414285f, 5, 10);
+ assertNext(ts, token, "^_greetings_earth", 1, 10.099504f, 0, 10);
+ assertNext(ts, token, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, token, "greetings_earth", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "greetings_earth_$", 1, 7.2111025f, 0, 10);
+ assertNext(ts, token, "^_hello_tellus", 1, 10.099504f, 0, 10);
+ assertNext(ts, token, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, token, "hello_tellus", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "hello_tellus_$", 1, 7.2111025f, 0, 10);
+ assertNext(ts, token, "tellus", 1, 1.0f, 5, 10);
+ assertNext(ts, token, "tellus_$", 1, 7.1414285f, 5, 10);
+ assertNext(ts, token, "^_greetings_tellus", 1, 10.099504f, 0, 10);
+ assertNext(ts, token, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, token, "greetings_tellus", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
- assertNull(ts.next());
+ assertNull(ts.next(token));
System.currentTimeMillis();
@@ -300,27 +298,28 @@
ts = new ShingleMatrixFilter(tls, 2, 3, '_', false);
-// while ((token = ts.next(token)) != null) {
-// System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
+// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
// token.clear();
// }
// shingle, position increment, weight, start offset, end offset
- assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
- assertNext(ts, "greetings_and", 1, 1.4142135f, 0, 4);
- assertNext(ts, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
- assertNext(ts, "and_salutations", 1, 1.4142135f, 0, 4);
- assertNext(ts, "and_salutations_world", 1, 1.7320508f, 0, 10);
- assertNext(ts, "salutations_world", 1, 1.4142135f, 0, 10);
- assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
- assertNext(ts, "and_salutations_earth", 1, 1.7320508f, 0, 10);
- assertNext(ts, "salutations_earth", 1, 1.4142135f, 0, 10);
- assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
- assertNext(ts, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
- assertNext(ts, "salutations_tellus", 1, 1.4142135f, 0, 10);
+ token = new Token();
+ assertNext(ts, token, "hello_world", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "greetings_and", 1, 1.4142135f, 0, 4);
+ assertNext(ts, token, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
+ assertNext(ts, token, "and_salutations", 1, 1.4142135f, 0, 4);
+ assertNext(ts, token, "and_salutations_world", 1, 1.7320508f, 0, 10);
+ assertNext(ts, token, "salutations_world", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "hello_earth", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "and_salutations_earth", 1, 1.7320508f, 0, 10);
+ assertNext(ts, token, "salutations_earth", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "hello_tellus", 1, 1.4142135f, 0, 10);
+ assertNext(ts, token, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
+ assertNext(ts, token, "salutations_tellus", 1, 1.4142135f, 0, 10);
- assertNull(ts.next());
+ assertNull(ts.next(token));
System.currentTimeMillis();
@@ -361,53 +360,53 @@
TokenStream ts = new ShingleMatrixFilter(matrix, 2, 4, '_', true, new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec());
-// Token token = new Token();
-// while ((token = ts.next(token)) != null) {
-// System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
+// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
// token.clear();
// }
- assertNext(ts, "no_surprise", 1, 1.4142135f, 0, 0);
- assertNext(ts, "no_surprise_to", 1, 1.7320508f, 0, 0);
- assertNext(ts, "no_surprise_to_see", 1, 2.0f, 0, 0);
- assertNext(ts, "surprise_to", 1, 1.4142135f, 0, 0);
- assertNext(ts, "surprise_to_see", 1, 1.7320508f, 0, 0);
- assertNext(ts, "surprise_to_see_england", 1, 2.0f, 0, 0);
- assertNext(ts, "to_see", 1, 1.4142135f, 0, 0);
- assertNext(ts, "to_see_england", 1, 1.7320508f, 0, 0);
- assertNext(ts, "to_see_england_manager", 1, 2.0f, 0, 0);
- assertNext(ts, "see_england", 1, 1.4142135f, 0, 0);
- assertNext(ts, "see_england_manager", 1, 1.7320508f, 0, 0);
- assertNext(ts, "see_england_manager_svennis", 1, 2.0f, 0, 0);
- assertNext(ts, "england_manager", 1, 1.4142135f, 0, 0);
- assertNext(ts, "england_manager_svennis", 1, 1.7320508f, 0, 0);
- assertNext(ts, "england_manager_svennis_in", 1, 2.0f, 0, 0);
- assertNext(ts, "manager_svennis", 1, 1.4142135f, 0, 0);
- assertNext(ts, "manager_svennis_in", 1, 1.7320508f, 0, 0);
- assertNext(ts, "manager_svennis_in_the", 1, 2.0f, 0, 0);
- assertNext(ts, "svennis_in", 1, 1.4142135f, 0, 0);
- assertNext(ts, "svennis_in_the", 1, 1.7320508f, 0, 0);
- assertNext(ts, "svennis_in_the_croud", 1, 2.0f, 0, 0);
- assertNext(ts, "in_the", 1, 1.4142135f, 0, 0);
- assertNext(ts, "in_the_croud", 1, 1.7320508f, 0, 0);
- assertNext(ts, "the_croud", 1, 1.4142135f, 0, 0);
- assertNext(ts, "see_england_manager_sven", 1, 2.0f, 0, 0);
- assertNext(ts, "england_manager_sven", 1, 1.7320508f, 0, 0);
- assertNext(ts, "england_manager_sven_göran", 1, 2.0f, 0, 0);
- assertNext(ts, "manager_sven", 1, 1.4142135f, 0, 0);
- assertNext(ts, "manager_sven_göran", 1, 1.7320508f, 0, 0);
- assertNext(ts, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0);
- assertNext(ts, "sven_göran", 1, 1.4142135f, 0, 0);
- assertNext(ts, "sven_göran_eriksson", 1, 1.7320508f, 0, 0);
- assertNext(ts, "sven_göran_eriksson_in", 1, 2.0f, 0, 0);
- assertNext(ts, "göran_eriksson", 1, 1.4142135f, 0, 0);
- assertNext(ts, "göran_eriksson_in", 1, 1.7320508f, 0, 0);
- assertNext(ts, "göran_eriksson_in_the", 1, 2.0f, 0, 0);
- assertNext(ts, "eriksson_in", 1, 1.4142135f, 0, 0);
- assertNext(ts, "eriksson_in_the", 1, 1.7320508f, 0, 0);
- assertNext(ts, "eriksson_in_the_croud", 1, 2.0f, 0, 0);
+ Token token = new Token();
+ assertNext(ts, token, "no_surprise", 1, 1.4142135f, 0, 0);
+ assertNext(ts, token, "no_surprise_to", 1, 1.7320508f, 0, 0);
+ assertNext(ts, token, "no_surprise_to_see", 1, 2.0f, 0, 0);
+ assertNext(ts, token, "surprise_to", 1, 1.4142135f, 0, 0);
+ assertNext(ts, token, "surprise_to_see", 1, 1.7320508f, 0, 0);
+ assertNext(ts, token, "surprise_to_see_england", 1, 2.0f, 0, 0);
+ assertNext(ts, token, "to_see", 1, 1.4142135f, 0, 0);
+ assertNext(ts, token, "to_see_england", 1, 1.7320508f, 0, 0);
+ assertNext(ts, token, "to_see_england_manager", 1, 2.0f, 0, 0);
+ assertNext(ts, token, "see_england", 1, 1.4142135f, 0, 0);
+ assertNext(ts, token, "see_england_manager", 1, 1.7320508f, 0, 0);
+ assertNext(ts, token, "see_england_manager_svennis", 1, 2.0f, 0, 0);
+ assertNext(ts, token, "england_manager", 1, 1.4142135f, 0, 0);
+ assertNext(ts, token, "england_manager_svennis", 1, 1.7320508f, 0, 0);
+ assertNext(ts, token, "england_manager_svennis_in", 1, 2.0f, 0, 0);
+ assertNext(ts, token, "manager_svennis", 1, 1.4142135f, 0, 0);
+ assertNext(ts, token, "manager_svennis_in", 1, 1.7320508f, 0, 0);
+ assertNext(ts, token, "manager_svennis_in_the", 1, 2.0f, 0, 0);
+ assertNext(ts, token, "svennis_in", 1, 1.4142135f, 0, 0);
+ assertNext(ts, token, "svennis_in_the", 1, 1.7320508f, 0, 0);
+ assertNext(ts, token, "svennis_in_the_croud", 1, 2.0f, 0, 0);
+ assertNext(ts, token, "in_the", 1, 1.4142135f, 0, 0);
+ assertNext(ts, token, "in_the_croud", 1, 1.7320508f, 0, 0);
+ assertNext(ts, token, "the_croud", 1, 1.4142135f, 0, 0);
+ assertNext(ts, token, "see_england_manager_sven", 1, 2.0f, 0, 0);
+ assertNext(ts, token, "england_manager_sven", 1, 1.7320508f, 0, 0);
+ assertNext(ts, token, "england_manager_sven_göran", 1, 2.0f, 0, 0);
+ assertNext(ts, token, "manager_sven", 1, 1.4142135f, 0, 0);
+ assertNext(ts, token, "manager_sven_göran", 1, 1.7320508f, 0, 0);
+ assertNext(ts, token, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0);
+ assertNext(ts, token, "sven_göran", 1, 1.4142135f, 0, 0);
+ assertNext(ts, token, "sven_göran_eriksson", 1, 1.7320508f, 0, 0);
+ assertNext(ts, token, "sven_göran_eriksson_in", 1, 2.0f, 0, 0);
+ assertNext(ts, token, "göran_eriksson", 1, 1.4142135f, 0, 0);
+ assertNext(ts, token, "göran_eriksson_in", 1, 1.7320508f, 0, 0);
+ assertNext(ts, token, "göran_eriksson_in_the", 1, 2.0f, 0, 0);
+ assertNext(ts, token, "eriksson_in", 1, 1.4142135f, 0, 0);
+ assertNext(ts, token, "eriksson_in_the", 1, 1.7320508f, 0, 0);
+ assertNext(ts, token, "eriksson_in_the_croud", 1, 2.0f, 0, 0);
- assertNull(ts.next());
+ assertNull(ts.next(token));
}
@@ -417,11 +416,9 @@
private Token tokenFactory(String text, int posIncr, int startOffset, int endOffset) {
- Token token = new Token();
- token.setTermText(text);
+ Token token = new Token(startOffset, endOffset);
+ token.setTermBuffer(text);
token.setPositionIncrement(posIncr);
- token.setStartOffset(startOffset);
- token.setEndOffset(endOffset);
return token;
}
@@ -435,48 +432,44 @@
}
private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset) {
- Token token = new Token();
- token.setTermText(text);
+ Token token = new Token(startOffset, endOffset);
+ token.setTermBuffer(text);
token.setPositionIncrement(posIncr);
ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight);
- token.setStartOffset(startOffset);
- token.setEndOffset(endOffset);
return token;
}
private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset, ShingleMatrixFilter.TokenPositioner positioner) {
- Token token = new Token();
- token.setTermText(text);
+ Token token = new Token(startOffset, endOffset);
+ token.setTermBuffer(text);
token.setPositionIncrement(posIncr);
ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight);
- token.setStartOffset(startOffset);
- token.setEndOffset(endOffset);
ShingleMatrixFilter.defaultSettingsCodec.setTokenPositioner(token, positioner);
return token;
}
// assert-methods start here
- private Token assertNext(TokenStream ts, String text) throws IOException {
- Token token = ts.next(new Token());
+ private Token assertNext(TokenStream ts, Token token, String text) throws IOException {
+ token = ts.next(token);
assertNotNull(token);
- assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
+ assertEquals(text, token.term());
return token;
}
- private Token assertNext(TokenStream ts, String text, int positionIncrement, float boost) throws IOException {
- Token token = ts.next(new Token());
+ private Token assertNext(TokenStream ts, Token token, String text, int positionIncrement, float boost) throws IOException {
+ token = ts.next(token);
assertNotNull(token);
- assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
+ assertEquals(text, token.term());
assertEquals(positionIncrement, token.getPositionIncrement());
assertEquals(boost, token.getPayload() == null ? 1f : PayloadHelper.decodeFloat(token.getPayload().getData()));
return token;
}
- private Token assertNext(TokenStream ts, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException {
- Token token = ts.next(new Token());
+ private Token assertNext(TokenStream ts, Token token, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException {
+ token = ts.next(token);
assertNotNull(token);
- assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
+ assertEquals(text, token.term());
assertEquals(positionIncrement, token.getPositionIncrement());
assertEquals(boost, token.getPayload() == null ? 1f : PayloadHelper.decodeFloat(token.getPayload().getData()));
assertEquals(startOffset, token.startOffset());
@@ -484,25 +477,31 @@
return token;
}
- private Token assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
- Token token = ts.next(new Token());
+ private Token assertNext(TokenStream ts, Token token, String text, int startOffset, int endOffset) throws IOException {
+ token = ts.next(token);
assertNotNull(token);
- assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
+ assertEquals(text, token.term());
assertEquals(startOffset, token.startOffset());
assertEquals(endOffset, token.endOffset());
return token;
}
+ private static Token createToken(String term, int start, int end)
+ {
+ Token token = new Token(start, end);
+ token.setTermBuffer(term);
+ return token;
+ }
+
public static class TokenListStream extends TokenStream {
private Collection tokens;
public TokenListStream(TokenStream ts) throws IOException {
tokens = new ArrayList();
- Token token;
- while ((token = ts.next(new Token())) != null) {
- tokens.add(token);
+ for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+ tokens.add((Token) token.clone());
}
}
@@ -512,14 +511,15 @@
private Iterator iterator;
- public Token next() throws IOException {
+ public Token next(Token token) throws IOException {
if (iterator == null) {
iterator = tokens.iterator();
}
if (!iterator.hasNext()) {
return null;
}
- return iterator.next();
+ token = (Token) iterator.next();
+ return (Token) token.clone();
}
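
For reference, the consumption idiom these test hunks migrate to looks like the following minimal sketch. It is illustrative only and not part of the patch; the analyzer, class name, field name and input text are arbitrary, and the point is simply that one Token instance is handed back into next(Token) on every call while anything that must survive the loop is cloned first.

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;

public class ReusableTokenConsumer {
  public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceAnalyzer()
        .tokenStream("field", new StringReader("hello world"));
    List kept = new ArrayList();
    // The same Token instance is reused by the producer on each call,
    // so clone it before storing it anywhere that outlives the loop.
    for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
      kept.add(token.clone());
      System.out.println(token.term() + " [" + token.startOffset() + "," + token.endOffset() + "]");
    }
    ts.close();
  }
}
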
Index: contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
===================================================================
--- contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (revision 683439)
+++ contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (working copy)
@@ -36,13 +36,13 @@
throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
-
+ Token token = new Token();
for (int i = 0; i < output.length; i++) {
- Token t = ts.next();
- assertNotNull(t);
- assertEquals(t.termText(), output[i]);
+ token = ts.next(token);
+ assertNotNull(token);
+ assertEquals(token.term(), output[i]);
}
- assertNull(ts.next());
+ assertNull(ts.next(token));
ts.close();
}
Index: contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java
===================================================================
--- contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java (revision 683439)
+++ contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java (working copy)
@@ -43,10 +43,9 @@
String test = "The quick red fox jumped over the lazy brown dogs";
NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))), 3, "D");
- Token tok = new Token();
boolean seenDogs = false;
- while ((tok = nptf.next(tok)) != null){
- if (tok.termText().equals("dogs")){
+ for (Token tok = nptf.next(new Token()); tok != null; tok = nptf.next(tok)) {
+ if (tok.term().equals("dogs")){
seenDogs = true;
assertTrue(tok.type() + " is not equal to " + "D", tok.type().equals("D") == true);
assertTrue("tok.getPayload() is null and it shouldn't be", tok.getPayload() != null);
@@ -69,7 +68,7 @@
public Token next(Token result) throws IOException {
result = input.next(result);
- if (result != null && result.termText().equals("dogs")) {
+ if (result != null && result.term().equals("dogs")) {
result.setType("D");
}
return result;
Index: contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterTest.java
===================================================================
--- contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterTest.java (revision 683439)
+++ contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterTest.java (working copy)
@@ -44,9 +44,8 @@
String test = "The quick red fox jumped over the lazy brown dogs";
TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))));
- Token tok = new Token();
int count = 0;
- while ((tok = nptf.next(tok)) != null){
+ for (Token tok = nptf.next(new Token()); tok != null; tok = nptf.next(tok)) {
assertTrue(tok.type() + " is not null and it should be", tok.type().equals(String.valueOf(Character.toUpperCase(tok.termBuffer()[0]))));
assertTrue("tok.getPayload() is null and it shouldn't be", tok.getPayload() != null);
String type = new String(tok.getPayload().getData(), "UTF-8");
Index: contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterTest.java
===================================================================
--- contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterTest.java (revision 683439)
+++ contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterTest.java (working copy)
@@ -42,9 +42,8 @@
String test = "The quick red fox jumped over the lazy brown dogs";
TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(new WhitespaceTokenizer(new StringReader(test)));
- Token tok = new Token();
int count = 0;
- while ((tok = nptf.next(tok)) != null){
+ for (Token tok = nptf.next(new Token()); tok != null; tok = nptf.next(tok)) {
assertTrue("tok is null and it shouldn't be", tok != null);
Payload pay = tok.getPayload();
assertTrue("pay is null and it shouldn't be", pay != null);
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (working copy)
@@ -105,12 +105,12 @@
return dict;
}
- public Token next() throws IOException {
+ public Token next(Token token) throws IOException {
if (tokens.size() > 0) {
return (Token)tokens.removeFirst();
}
- Token token = input.next();
+ token = input.next(token);
if (token == null) {
return null;
}
@@ -145,9 +145,10 @@
protected final Token createToken(final int offset, final int length,
final Token prototype) {
- Token t = new Token(prototype.startOffset() + offset, prototype
- .startOffset()
- + offset + length, prototype.type());
+ Token t = (Token) prototype.clone();
+ int newStart = t.startOffset() + offset;
+ t.setStartOffset(newStart);
+ t.setEndOffset(newStart + length);
t.setTermBuffer(prototype.termBuffer(), offset, length);
t.setPositionIncrement(0);
return t;
@@ -155,7 +156,7 @@
protected void decompose(final Token token) {
// In any case we give the original token back
- tokens.add(token);
+ tokens.add((Token) token.clone());
// Only words longer than minWordSize get processed
if (token.termLength() < this.minWordSize) {
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java (working copy)
@@ -37,25 +37,19 @@
this.charset = charset;
}
- public final Token next() throws java.io.IOException
+ public final Token next(Token token) throws java.io.IOException
{
- Token t = input.next();
+ token = input.next(token);
- if (t == null)
+ if (token == null)
return null;
- String txt = t.termText();
-
- char[] chArray = txt.toCharArray();
- for (int i = 0; i < chArray.length; i++)
+ char[] chArray = token.termBuffer();
+ int chLen = token.termLength();
+ for (int i = 0; i < chLen; i++)
{
chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
}
-
- String newTxt = new String(chArray);
- // create new token
- Token newToken = new Token(newTxt, t.startOffset(), t.endOffset());
-
- return newToken;
+ return token;
}
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java (working copy)
@@ -35,7 +35,6 @@
/**
* The actual token in the input stream.
*/
- private Token token = null;
private RussianStemmer stemmer = null;
public RussianStemFilter(TokenStream in, char[] charset)
@@ -47,22 +46,17 @@
/**
* @return Returns the next token in the stream, or null at EOS
*/
- public final Token next() throws IOException
+ public final Token next(Token token) throws IOException
{
- if ((token = input.next()) == null)
+ if ((token = input.next(token)) == null)
{
return null;
}
- else
- {
- String s = stemmer.stem(token.termText());
- if (!s.equals(token.termText()))
- {
- return new Token(s, token.startOffset(), token.endOffset(),
- token.type());
- }
- return token;
- }
+ String term = token.term();
+ String s = stemmer.stem(term);
+ if (s != null && !s.equals(term))
+ token.setTermBuffer(s);
+ return token;
}
/**
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizer.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizer.java (working copy)
@@ -48,7 +48,7 @@
public void add(Token t) {
//check to see if this is a Category
if (t != null && typeToMatch.equals(t.type())){
- lst.add(t.clone());
+ super.add(t);
}
}
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizer.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizer.java (working copy)
@@ -73,10 +73,10 @@
//Check to see if this token is a date
if (t != null) {
try {
- Date date = dateFormat.parse(new String(t.termBuffer(), 0, t.termLength()));//We don't care about the date, just that we can parse it as a date
+ Date date = dateFormat.parse(t.term());//We don't care about the date, just that we can parse it as a date
if (date != null) {
t.setType(DATE_TYPE);
- lst.add(t.clone());
+ super.add(t);
}
} catch (ParseException e) {
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java (working copy)
@@ -37,7 +37,6 @@
/**
* The actual token in the input stream.
*/
- private Token token = null;
private GermanStemmer stemmer = null;
private Set exclusionSet = null;
@@ -48,7 +47,7 @@
}
/**
- * Builds a GermanStemFilter that uses an exclusiontable.
+ * Builds a GermanStemFilter that uses an exclusion table.
*/
public GermanStemFilter( TokenStream in, Set exclusionSet )
{
@@ -59,25 +58,21 @@
/**
* @return Returns the next token in the stream, or null at EOS
*/
- public final Token next()
+ public final Token next(Token token)
throws IOException
{
- if ( ( token = input.next() ) == null ) {
+ if ( ( token = input.next(token) ) == null ) {
return null;
}
- // Check the exclusiontable
- else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) {
- return token;
+ String term = token.term();
+ // Check the exclusion table.
+ if (exclusionSet == null || !exclusionSet.contains(term)) {
+ String s = stemmer.stem(term);
+ // If not stemmed, don't waste the time adjusting the token.
+ if ((s != null) && !s.equals(term))
+ token.setTermBuffer(s);
}
- else {
- String s = stemmer.stem( token.termText() );
- // If not stemmed, dont waste the time creating a new token
- if ( !s.equals( token.termText() ) ) {
- return new Token( s, token.startOffset(),
- token.endOffset(), token.type() );
- }
- return token;
- }
+ return token;
}
/**
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (working copy)
@@ -47,7 +47,7 @@
/**
* filler token for when positionIncrement is more than 1
*/
- public static final String FILLER_TOKEN = "_";
+ public static final char[] FILLER_TOKEN = { '_' };
/**
@@ -150,11 +150,11 @@
}
/* (non-Javadoc)
- * @see org.apache.lucene.analysis.TokenStream#next()
- */
- public Token next() throws IOException {
+ * @see org.apache.lucene.analysis.TokenStream#next()
+ */
+ public Token next(Token token) throws IOException {
if (outputBuf.isEmpty()) {
- fillOutputBuf();
+ fillOutputBuf(token);
}
Token nextToken = null;
if ( ! outputBuf.isEmpty())
@@ -173,16 +173,19 @@
* @return the next token, or null if at end of input stream
* @throws IOException if the input stream has a problem
*/
- private Token getNextToken() throws IOException {
+ private Token getNextToken(Token token) throws IOException {
if (tokenBuf.isEmpty()) {
- Token lastToken = input.next();
- if (lastToken != null) {
- for (int i = 1; i < lastToken.getPositionIncrement(); i++) {
- tokenBuf.add(new Token(FILLER_TOKEN, lastToken.startOffset(),
- lastToken.startOffset()));
+ token = input.next(token);
+ if (token != null) {
+ for (int i = 1; i < token.getPositionIncrement(); i++) {
+ Token fillerToken = (Token) token.clone();
+ // A filler token occupies no space
+ fillerToken.setEndOffset(fillerToken.startOffset());
+ fillerToken.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+ tokenBuf.add(fillerToken);
}
- tokenBuf.add(lastToken);
- return getNextToken();
+ tokenBuf.add(token.clone());
+ return getNextToken(token);
} else {
return null;
}
@@ -196,13 +199,13 @@
*
* @throws IOException if there's a problem getting the next token
*/
- private void fillOutputBuf() throws IOException {
+ private void fillOutputBuf(Token token) throws IOException {
boolean addedToken = false;
/*
* Try to fill the shingle buffer.
*/
do {
- Token token = getNextToken();
+ token = getNextToken(token);
if (token != null) {
shingleBuf.add(token);
if (shingleBuf.size() > maxShingleSize)
@@ -235,17 +238,17 @@
}
int i = 0;
- Token token = null;
+ Token shingle = null;
for (Iterator it = shingleBuf.iterator(); it.hasNext(); ) {
- token = (Token) it.next();
+ shingle = (Token) it.next();
for (int j = i; j < shingles.length; j++) {
if (shingles[j].length() != 0) {
shingles[j].append(TOKEN_SEPARATOR);
}
- shingles[j].append(token.termBuffer(), 0, token.termLength());
+ shingles[j].append(shingle.termBuffer(), 0, shingle.termLength());
}
- endOffsets[i] = token.endOffset();
+ endOffsets[i] = shingle.endOffset();
i++;
}
@@ -258,17 +261,26 @@
/*
* Push new tokens to the output buffer.
*/
+ if (!shingleBuf.isEmpty()) {
+ Token firstShingle = (Token) shingleBuf.get(0);
+ shingle = (Token) firstShingle.clone();
+ shingle.setType(tokenType);
+ }
for (int j = 1; j < shingleBuf.size(); j++) {
- Token shingle = new Token(shingles[j].toString(),
- ((Token) shingleBuf.get(0)).startOffset(),
- endOffsets[j],
- tokenType);
+ shingle.setEndOffset(endOffsets[j]);
+ StringBuffer buf = shingles[j];
+ int termLength = buf.length();
+ char[] termBuffer = shingle.termBuffer();
+ if (termBuffer.length < termLength)
+ termBuffer = shingle.resizeTermBuffer(termLength);
+ buf.getChars(0, termLength, termBuffer, 0);
+ shingle.setTermLength(termLength);
if ((! outputUnigrams) && j == 1) {
shingle.setPositionIncrement(1);
} else {
shingle.setPositionIncrement(0);
}
- outputBuf.add(shingle);
+ outputBuf.add(shingle.clone());
}
}
}
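
The shingle construction above copies characters straight from a StringBuffer into the reused Token instead of building an intermediate String. A standalone sketch of that idiom follows; the class and method names are made up for illustration, and only the Token calls mirror the hunk.

import org.apache.lucene.analysis.Token;

public class TermBufferCopyExample {
  /**
   * Copies a StringBuffer into a Token's term buffer, growing the buffer
   * when it is too small, then records the new term length.
   */
  static void setTerm(Token token, StringBuffer text) {
    int termLength = text.length();
    char[] termBuffer = token.termBuffer();
    if (termBuffer.length < termLength) {
      termBuffer = token.resizeTermBuffer(termLength);
    }
    text.getChars(0, termLength, termBuffer, 0);
    token.setTermLength(termLength);
  }

  public static void main(String[] args) {
    Token token = new Token(0, 5);
    setTerm(token, new StringBuffer("hello"));
    System.out.println(token.term());   // prints: hello
  }
}
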
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java (working copy)
@@ -17,16 +17,23 @@
* limitations under the License.
*/
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.NoSuchElementException;
+import java.util.Set;
+
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.index.Payload;
-import java.io.IOException;
-import java.util.*;
-
/**
* A ShingleFilter constructs shingles (token n-grams) from a token stream.
* In other words, it creates combinations of tokens as a single token.
@@ -340,14 +347,14 @@
}
// shingle token factory
- StringBuilder sb = new StringBuilder(termLength + 10); // paranormal abillity to forsay the future.
+ StringBuilder sb = new StringBuilder(termLength + 10); // paranormal ability to foresee the future.
for (Token shingleToken : shingle) {
if (spacerCharacter != null && sb.length() > 0) {
sb.append(spacerCharacter);
}
sb.append(shingleToken.termBuffer(), 0, shingleToken.termLength());
}
- token.setTermText(sb.toString());
+ token.setTermBuffer(sb.toString());
updateToken(token, shingle, currentPermutationTokensStartOffset, currentPermutationRows, currentPermuationTokens);
return token;
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java (working copy)
@@ -35,25 +35,19 @@
this.charset = charset;
}
- public final Token next() throws java.io.IOException
+ public final Token next(Token token) throws java.io.IOException
{
- Token t = input.next();
+ token = input.next(token);
- if (t == null)
+ if (token == null)
return null;
- String txt = t.termText();
-
- char[] chArray = txt.toCharArray();
- for (int i = 0; i < chArray.length; i++)
+ char[] chArray = token.termBuffer();
+ int chLen = token.termLength();
+ for (int i = 0; i < chLen; i++)
{
chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset);
}
-
- String newTxt = new String(chArray);
- // create new token
- Token newToken = new Token(newTxt, t.startOffset(), t.endOffset());
-
- return newToken;
+ return token;
}
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java (working copy)
@@ -18,8 +18,11 @@
*/
import java.util.Hashtable;
-import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
/**
* Title: ChineseFilter
* Description: Filter with a stop word table
@@ -61,10 +64,10 @@
stopTable.put(STOP_WORDS[i], STOP_WORDS[i]);
}
- public final Token next() throws java.io.IOException {
+ public final Token next(Token token) throws java.io.IOException {
- for (Token token = input.next(); token != null; token = input.next()) {
- String text = token.termText();
+ for (token = input.next(token); token != null; token = input.next(token)) {
+ String text = token.term();
// why not key off token type here assuming ChineseTokenizer comes first?
if (stopTable.get(text) == null) {
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java (working copy)
@@ -19,9 +19,11 @@
import java.io.Reader;
-import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+
/**
* Title: ChineseTokenizer
* Description: Extract tokens from the Stream using Character.getType()
@@ -75,17 +77,19 @@
}
- private final Token flush() {
+ private final Token flush(Token token) {
if (length>0) {
//System.out.println(new String(buffer, 0, length));
- return new Token(new String(buffer, 0, length), start, start+length);
+ token.clear();
+ token.setTermBuffer(buffer, 0, length);
+ token.setStartOffset(start);
+ token.setEndOffset(start + length);
+ return token;
}
else
return null;
}
- public final Token next() throws java.io.IOException {
+ public final Token next(Token token) throws java.io.IOException {
length = 0;
start = offset;
@@ -101,7 +105,7 @@
bufferIndex = 0;
}
- if (dataLen == -1) return flush();
+ if (dataLen == -1) return flush(token);
else
c = ioBuffer[bufferIndex++];
@@ -112,20 +116,20 @@
case Character.LOWERCASE_LETTER:
case Character.UPPERCASE_LETTER:
push(c);
- if (length == MAX_WORD_LEN) return flush();
+ if (length == MAX_WORD_LEN) return flush(token);
break;
case Character.OTHER_LETTER:
if (length>0) {
bufferIndex--;
offset--;
- return flush();
+ return flush(token);
}
push(c);
- return flush();
+ return flush(token);
default:
- if (length>0) return flush();
+ if (length>0) return flush(token);
break;
}
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java (working copy)
@@ -28,11 +28,12 @@
public class SingleTokenTokenStream extends TokenStream {
private boolean exhausted = false;
+ // Callers must never be able to modify the stored token, so only accept and hand out clones.
private Token token;
public SingleTokenTokenStream(Token token) {
- this.token = token;
+ this.token = (Token) token.clone();
}
@@ -41,7 +42,7 @@
return null;
}
exhausted = true;
- return token;
+ return (Token) token.clone();
}
@@ -50,10 +51,10 @@
}
public Token getToken() {
- return token;
+ return (Token) token.clone();
}
public void setToken(Token token) {
- this.token = token;
+ this.token = (Token) token.clone();
}
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java (working copy)
@@ -124,7 +124,6 @@
if (source.termBuffer() != null) {
setTermBuffer(source.termBuffer(), 0, source.termLength());
} else {
- setTermText(null);
setTermLength(0);
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java (working copy)
@@ -27,18 +27,7 @@
*/
public class EmptyTokenStream extends TokenStream {
- public Token next() throws IOException {
- return null;
- }
-
public Token next(Token result) throws IOException {
return null;
}
-
- public void reset() throws IOException {
- }
-
- public void close() throws IOException {
- }
-
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java (working copy)
@@ -36,7 +36,6 @@
/**
* The actual token in the input stream.
*/
- private Token token = null;
private BrazilianStemmer stemmer = null;
private Set exclusions = null;
@@ -53,22 +52,22 @@
/**
* @return Returns the next token in the stream, or null at EOS.
*/
- public final Token next()
+ public final Token next(Token token)
throws IOException {
- if ((token = input.next()) == null) {
+ if ((token = input.next(token)) == null) {
return null;
}
- // Check the exclusiontable.
- else if (exclusions != null && exclusions.contains(token.termText())) {
- return token;
- } else {
- String s = stemmer.stem(token.termText());
- // If not stemmed, dont waste the time creating a new token.
- if ((s != null) && !s.equals(token.termText())) {
- return new Token(s, token.startOffset(), token.endOffset(), token.type());
- }
- return token;
+
+ String term = token.term();
+
+ // Check the exclusion table.
+ if (exclusions == null || !exclusions.contains(term)) {
+ String s = stemmer.stem(term);
+ // If not stemmed, don't waste the time adjusting the token.
+ if ((s != null) && !s.equals(term))
+ token.setTermBuffer(s);
}
+ return token;
}
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (working copy)
@@ -64,7 +64,7 @@
}
/** Returns the next token in the stream, or null at EOS. */
- public final Token next() throws IOException {
+ public final Token next(Token token) throws IOException {
if (!started) {
started = true;
gramSize = minGram;
@@ -82,9 +82,12 @@
if (pos+gramSize > inLen)
return null;
}
- String gram = inStr.substring(pos, pos+gramSize);
+
int oldPos = pos;
pos++;
- return new Token(gram, oldPos, oldPos+gramSize);
+ token.setTermBuffer(inStr, oldPos, gramSize);
+ token.setStartOffset(oldPos);
+ token.setEndOffset(oldPos+gramSize);
+ return token;
}
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java (working copy)
@@ -115,15 +115,14 @@
}
/** Returns the next token in the stream, or null at EOS. */
- public final Token next() throws IOException {
+ public final Token next(Token token) throws IOException {
if (ngrams.size() > 0) {
return (Token) ngrams.removeFirst();
}
- Token token = input.next();
- if (token == null) {
+ token = input.next(token);
+ if (token == null)
return null;
- }
ngram(token);
if (ngrams.size() > 0)
@@ -133,12 +132,12 @@
}
private void ngram(Token token) {
- String inStr = token.termText();
- int inLen = inStr.length();
+ int termLength = token.termLength();
+ char[] termBuffer = token.termBuffer();
int gramSize = minGram;
while (gramSize <= maxGram) {
// if the remaining input is too short, we can't generate any n-grams
- if (gramSize > inLen) {
+ if (gramSize > termLength) {
return;
}
@@ -147,13 +146,13 @@
return;
}
- Token tok;
- if (side == Side.FRONT) {
- tok = new Token(inStr.substring(0, gramSize), 0, gramSize);
- }
- else {
- tok = new Token(inStr.substring(inLen-gramSize), inLen-gramSize, inLen);
- }
+ // grab gramSize chars from front or back
+ int start = side == Side.FRONT ? 0 : termLength - gramSize;
+ int end = start + gramSize;
+ Token tok = (Token) token.clone();
+ tok.setStartOffset(start);
+ tok.setEndOffset(end);
+ tok.setTermBuffer(termBuffer, start, gramSize);
ngrams.add(tok);
gramSize++;
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (working copy)
@@ -19,6 +19,7 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter.Side;
import java.io.IOException;
import java.io.Reader;
@@ -113,13 +114,13 @@
}
/** Returns the next token in the stream, or null at EOS. */
- public final Token next() throws IOException {
+ public final Token next(Token token) throws IOException {
// if we are just starting, read the whole input
if (!started) {
started = true;
char[] chars = new char[1024];
input.read(chars);
- inStr = new String(chars).trim(); // remove any trailing empty strings
+ inStr = new String(chars).trim(); // remove any leading or trailing spaces
inLen = inStr.length();
gramSize = minGram;
}
@@ -134,15 +135,13 @@
return null;
}
- Token tok;
- if (side == Side.FRONT) {
- tok = new Token(inStr.substring(0, gramSize), 0, gramSize);
- }
- else {
- tok = new Token(inStr.substring(inLen-gramSize), inLen-gramSize, inLen);
- }
-
+ // grab gramSize chars from front or back
+ int start = side == Side.FRONT ? 0 : inLen - gramSize;
+ int end = start + gramSize;
+ token.setTermBuffer(inStr, start, gramSize);
+ token.setStartOffset(start);
+ token.setEndOffset(end);
gramSize++;
- return tok;
+ return token;
}
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (working copy)
@@ -63,12 +63,12 @@
}
/** Returns the next token in the stream, or null at EOS. */
- public final Token next() throws IOException {
+ public final Token next(Token token) throws IOException {
if (ngrams.size() > 0) {
return (Token) ngrams.removeFirst();
}
- Token token = input.next();
+ token = input.next(token);
if (token == null) {
return null;
}
@@ -81,15 +81,16 @@
}
private void ngram(Token token) {
- String inStr = token.termText();
- int inLen = inStr.length();
+ char[] termBuffer = token.termBuffer();
+ int termLength = token.termLength();
int gramSize = minGram;
while (gramSize <= maxGram) {
int pos = 0; // reset to beginning of string
- while (pos+gramSize <= inLen) { // while there is input
- String gram = inStr.substring(pos, pos+gramSize);
- Token tok = new Token(gram, pos, pos+gramSize);
-// tok.setPositionIncrement(pos);
+ while (pos+gramSize <= termLength) { // while there is input
+ Token tok = (Token)token.clone();
+ tok.setStartOffset(pos);
+ tok.setEndOffset(pos+gramSize);
+ tok.setTermBuffer(termBuffer, pos, gramSize);
ngrams.add(tok);
pos++;
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java (working copy)
@@ -37,12 +37,11 @@
/**
* The actual token in the input stream.
*/
- private Token token = null;
private FrenchStemmer stemmer = null;
private Set exclusions = null;
public FrenchStemFilter( TokenStream in ) {
- super(in);
+ super(in);
stemmer = new FrenchStemmer();
}
@@ -55,23 +54,21 @@
/**
* @return Returns the next token in the stream, or null at EOS
*/
- public final Token next()
+ public final Token next(Token token)
throws IOException {
- if ( ( token = input.next() ) == null ) {
+ if ( ( token = input.next(token) ) == null ) {
return null;
}
- // Check the exclusiontable
- else if ( exclusions != null && exclusions.contains( token.termText() ) ) {
- return token;
+ String term = token.term();
+
+ // Check the exclusion table
+ if ( exclusions == null || !exclusions.contains( term ) ) {
+ String s = stemmer.stem( term );
+ // If not stemmed, don't waste the time adjusting the token.
+ if ((s != null) && !s.equals( term ) )
+ token.setTermBuffer(s);
}
- else {
- String s = stemmer.stem( token.termText() );
- // If not stemmed, dont waste the time creating a new token
- if ( !s.equals( token.termText() ) ) {
- return new Token( s, token.startOffset(), token.endOffset(), token.type());
- }
- return token;
- }
+ return token;
}
/**
* Set a alternative/custom FrenchStemmer for this filter.
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (working copy)
@@ -38,7 +38,7 @@
public class ElisionFilter extends TokenFilter {
private Set articles = null;
- private static String apostrophes = "'’";
+ private static char[] apostrophes = {'\'', '’'};
public void setArticles(Set articles) {
this.articles = new HashSet();
@@ -74,25 +74,35 @@
}
/**
- * Returns the next input Token whith termText() without elisioned start
+ * Returns the next input Token, with any elided article stripped from the start of term()
*/
- public Token next() throws IOException {
- Token t = input.next();
- if (t == null)
+ public Token next(Token token) throws IOException {
+ token = input.next(token);
+ if (token == null)
return null;
- String text = t.termText();
- System.out.println(text);
- int minPoz = -1;
- int poz;
- for (int i = 0; i < apostrophes.length(); i++) {
- poz = text.indexOf(apostrophes.charAt(i));
- if (poz != -1)
- minPoz = (minPoz == -1) ? poz : Math.min(poz, minPoz);
+
+ char[] termBuffer = token.termBuffer();
+ int termLength = token.termLength();
+
+ int minPoz = Integer.MAX_VALUE;
+ for (int i = 0; i < apostrophes.length; i++) {
+ char apos = apostrophes[i];
+ // The equivalent of String.indexOf(ch)
+ for (int poz = 0; poz < termLength ; poz++) {
+ if (termBuffer[poz] == apos) {
+ minPoz = Math.min(poz, minPoz);
+ break;
+ }
+ }
}
- if (minPoz != -1
- && articles.contains(text.substring(0, minPoz).toLowerCase()))
- text = text.substring(minPoz + 1);
- return new Token(text, t.startOffset(), t.endOffset(), t.type());
+
+ // An apostrophe has been found. If the prefix is an article, strip it off.
+ if (minPoz != Integer.MAX_VALUE
+ && articles.contains(new String(token.termBuffer(), 0, minPoz).toLowerCase())) {
+ token.setTermBuffer(token.termBuffer(), minPoz + 1, token.termLength() - (minPoz + 1));
+ }
+
+ return token;
}
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (working copy)
@@ -26,7 +26,7 @@
/**
* CJKTokenizer was modified from StopTokenizer which does a decent job for
* most European languages. It performs other token methods for double-byte
- * Characters: the token will return at each two charactors with overlap match.
+ * Characters: the token will return at each two characters with overlap match.
* Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it
* also need filter filter zero length token ""
* for Digit: digit, '+', '#' will token as letter
@@ -96,13 +96,14 @@
* See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
* for detail.
*
+ * @param token a reusable token
* @return Token
*
* @throws java.io.IOException - throw IOException when read error
- * hanppened in the InputStream
+ * happened in the InputStream
*
*/
- public final Token next() throws java.io.IOException {
+ public final Token next(Token token) throws java.io.IOException {
/** how many character(s) has been stored in buffer */
int length = 0;
@@ -110,10 +111,10 @@
int start = offset;
while (true) {
- /** current charactor */
+ /** current character */
char c;
- /** unicode block of current charactor for detail */
+ /** unicode block of current character for detail */
Character.UnicodeBlock ub;
offset++;
@@ -198,7 +199,7 @@
}
}
} else {
- // non-ASCII letter, eg."C1C2C3C4"
+ // non-ASCII letter, e.g."C1C2C3C4"
if (Character.isLetter(c)) {
if (length == 0) {
start = offset - 1;
@@ -236,8 +237,11 @@
}
}
- return new Token(new String(buffer, 0, length), start, start + length,
- tokenType
- );
+ token.clear();
+ token.setTermBuffer(buffer, 0, length);
+ token.setStartOffset(start);
+ token.setEndOffset(start + length);
+ token.setType(tokenType);
+ return token;
}
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java (working copy)
@@ -38,7 +38,6 @@
/**
* The actual token in the input stream.
*/
- private Token token = null;
private DutchStemmer stemmer = null;
private Set exclusions = null;
@@ -48,7 +47,7 @@
}
/**
- * Builds a DutchStemFilter that uses an exclusiontable.
+ * Builds a DutchStemFilter that uses an exclusion table.
*/
public DutchStemFilter(TokenStream _in, Set exclusiontable) {
this(_in);
@@ -66,23 +65,20 @@
/**
* @return Returns the next token in the stream, or null at EOS
*/
- public Token next() throws IOException {
- if ((token = input.next()) == null) {
+ public Token next(Token token) throws IOException {
+ if ((token = input.next(token)) == null) {
return null;
}
+ String term = token.term();
- // Check the exclusiontable
- else if (exclusions != null && exclusions.contains(token.termText())) {
- return token;
- } else {
- String s = stemmer.stem(token.termText());
- // If not stemmed, dont waste the time creating a new token
- if (!s.equals(token.termText())) {
- return new Token(s, token.startOffset(),
- token.endOffset(), token.type());
- }
- return token;
+ // Check the exclusion table.
+ if (exclusions == null || !exclusions.contains(term)) {
+ String s = stemmer.stem(term);
+ // If not stemmed, don't waste the time adjusting the token.
+ if ((s != null) && !s.equals(term))
+ token.setTermBuffer(s);
}
+ return token;
}
/**
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (revision 683439)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (working copy)
@@ -40,31 +40,40 @@
breaker = BreakIterator.getWordInstance(new Locale("th"));
}
- public Token next() throws IOException {
+ public Token next(Token token) throws IOException {
if (thaiToken != null) {
- String text = thaiToken.termText();
int start = breaker.current();
int end = breaker.next();
if (end != BreakIterator.DONE) {
- return new Token(text.substring(start, end),
- thaiToken.startOffset()+start, thaiToken.startOffset()+end, thaiToken.type());
+ token.setTermBuffer(thaiToken.termBuffer(), start, end - start);
+ token.setStartOffset(thaiToken.startOffset()+start);
+ token.setEndOffset(thaiToken.startOffset()+end);
+ token.setType(thaiToken.type());
+ token.setPayload(thaiToken.getPayload());
+ token.setFlags(thaiToken.getFlags());
+ return token;
}
thaiToken = null;
}
- Token tk = input.next();
- if (tk == null) {
+
+ token = input.next(token);
+ if (token == null || token.termLength() == 0) {
return null;
}
- String text = tk.termText();
+
+ String text = token.term();
if (UnicodeBlock.of(text.charAt(0)) != UnicodeBlock.THAI) {
- return new Token(text.toLowerCase(), tk.startOffset(), tk.endOffset(), tk.type());
+ token.setTermBuffer(text.toLowerCase());
+ return token;
}
- thaiToken = tk;
+
+ thaiToken = (Token) token.clone();
breaker.setText(text);
int end = breaker.next();
if (end != BreakIterator.DONE) {
- return new Token(text.substring(0, end),
- thaiToken.startOffset(), thaiToken.startOffset()+end, thaiToken.type());
+ token.setTermBuffer(text, 0, end);
+ token.setEndOffset(token.startOffset() + end);
+ return token;
}
return null;
}
Index: contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java
===================================================================
--- contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java (revision 683439)
+++ contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java (working copy)
@@ -28,6 +28,7 @@
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Hits;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
@@ -808,10 +809,10 @@
throws IOException
{
TokenStream ts = analyzer.tokenStream(fieldName, r);
- org.apache.lucene.analysis.Token token;
int tokenCount=0;
- while ((token = ts.next()) != null) { // for every token
- String word = token.termText();
+ // for every token
+ for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+ String word = token.term();
tokenCount++;
if(tokenCount>maxNumTokensParsed)
{
@@ -872,7 +873,7 @@
* For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
*
* @param r the reader that has the content of the document
- * @return the most intresting words in the document ordered by score, with the highest scoring, or best entry, first
+ * @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
*
* @see #retrieveInterestingTerms
*/
Index: contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java
===================================================================
--- contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java (revision 683439)
+++ contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java (working copy)
@@ -21,6 +21,7 @@
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
@@ -85,12 +86,10 @@
throws IOException
{
TokenStream ts = a.tokenStream( field, new StringReader( body));
- org.apache.lucene.analysis.Token t;
BooleanQuery tmp = new BooleanQuery();
Set already = new HashSet(); // ignore dups
- while ( (t = ts.next()) != null)
- {
- String word = t.termText();
+ for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+ String word = token.term();
// ignore opt stop words
if ( stop != null &&
stop.contains( word)) continue;
Index: contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java
===================================================================
--- contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java (revision 683439)
+++ contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java (working copy)
@@ -104,18 +104,20 @@
{
if(f.queryString==null) return;
TokenStream ts=analyzer.tokenStream(f.fieldName,new StringReader(f.queryString));
- Token token=ts.next();
+ Token token=new Token();
+ token = ts.next(token);
int corpusNumDocs=reader.numDocs();
Term internSavingTemplateTerm =new Term(f.fieldName,""); //optimization to avoid constructing new Term() objects
HashSet processedTerms=new HashSet();
while(token!=null)
- {
- if(!processedTerms.contains(token.termText()))
+ {
+ String term = token.term();
+ if(!processedTerms.contains(term))
{
- processedTerms.add(token.termText());
+ processedTerms.add(term);
ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
float minScore=0;
- Term startTerm=internSavingTemplateTerm.createTerm(token.termText());
+ Term startTerm=internSavingTemplateTerm.createTerm(term);
FuzzyTermEnum fe=new FuzzyTermEnum(reader,startTerm,f.minSimilarity,f.prefixLength);
TermEnum origEnum = reader.terms(startTerm);
int df=0;
@@ -162,7 +164,7 @@
q.insert(st);
}
}
- token=ts.next();
+ token=ts.next(token);
}
}
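
Across the filters touched above the producer-side pattern is always the same: fetch the reusable token from input.next(token), edit its term buffer in place, and return it instead of allocating a new Token. A minimal hypothetical filter illustrating that contract is sketched below; it is not taken from the patch, and the upper-casing is just an example transformation.

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

/** Illustrative only: upper-cases each term by editing the reusable Token in place. */
public class UpperCaseExampleFilter extends TokenFilter {

  public UpperCaseExampleFilter(TokenStream in) {
    super(in);
  }

  public final Token next(Token token) throws IOException {
    token = input.next(token);
    if (token == null)
      return null;
    char[] buffer = token.termBuffer();
    int length = token.termLength();
    // Modify the characters in place; offsets, type and payload are left untouched.
    for (int i = 0; i < length; i++) {
      buffer[i] = Character.toUpperCase(buffer[i]);
    }
    return token;
  }
}
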