Index: src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java
===================================================================
--- src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java	(revision 683629)
+++ src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java	(working copy)
@@ -147,29 +147,30 @@
       super(in);
     }
-    public final org.apache.lucene.analysis.Token next() throws java.io.IOException {
+    public final Token next(Token token) throws java.io.IOException {
       if (multiToken > 0) {
-        org.apache.lucene.analysis.Token token =
-            new org.apache.lucene.analysis.Token("multi"+(multiToken+1), prevToken.startOffset(),
-                prevToken.endOffset(), prevToken.type());
+        String term = "multi"+(multiToken+1);
+        token.setTermBuffer(term.toCharArray(), 0, term.length());
+        token.setStartOffset(prevToken.startOffset());
+        token.setEndOffset(prevToken.endOffset());
+        token.setType(prevToken.type());
+        token.setPayload(prevToken.getPayload());
+        token.setFlags(prevToken.getFlags());
         token.setPositionIncrement(0);
         multiToken--;
         return token;
       } else {
-        org.apache.lucene.analysis.Token t = input.next();
-        prevToken = t;
-        if (t == null)
+        token = input.next(token);
+        prevToken = token;
+        if (token == null)
           return null;
-        String text = t.termText();
+        String text = new String(token.termBuffer(), 0, token.termLength());
         if (text.equals("triplemulti")) {
           multiToken = 2;
-          return t;
         } else if (text.equals("multi")) {
           multiToken = 1;
-          return t;
-        } else {
-          return t;
         }
+        return token;
       }
     }
   }
@@ -197,20 +198,14 @@
       super(in);
     }
-    public final org.apache.lucene.analysis.Token next() throws java.io.IOException {
-      for (Token t = input.next(); t != null; t = input.next()) {
-        if (t.termText().equals("the")) {
+    public final Token next(Token token) throws java.io.IOException {
+      for (token = input.next(token); token != null; token = input.next(token)) {
+        if (token.termText().equals("the")) {
           // stopword, do nothing
-        } else if (t.termText().equals("quick")) {
-          org.apache.lucene.analysis.Token token =
-              new org.apache.lucene.analysis.Token(t.termText(), t.startOffset(),
-                  t.endOffset(), t.type());
+        } else if (token.termText().equals("quick")) {
           token.setPositionIncrement(2);
           return token;
         } else {
-          org.apache.lucene.analysis.Token token =
-              new org.apache.lucene.analysis.Token(t.termText(), t.startOffset(),
-                  t.endOffset(), t.type());
           token.setPositionIncrement(1);
           return token;
         }
Index: src/test/org/apache/lucene/queryParser/TestQueryParser.java
===================================================================
--- src/test/org/apache/lucene/queryParser/TestQueryParser.java	(revision 683629)
+++ src/test/org/apache/lucene/queryParser/TestQueryParser.java	(working copy)
@@ -75,17 +75,21 @@
     boolean inPhrase = false;
     int savedStart = 0, savedEnd = 0;
-    public Token next() throws IOException {
+    public Token next(Token token) throws IOException {
       if (inPhrase) {
         inPhrase = false;
-        return new Token("phrase2", savedStart, savedEnd);
+        token.setStartOffset(savedStart);
+        token.setEndOffset(savedEnd);
+        token.setTermBuffer("phrase2".toCharArray(), 0, "phrase2".length());
+        return token;
       } else
-        for (Token token = input.next(); token != null; token = input.next()) {
+        for (token = input.next(token); token != null; token = input.next(token)) {
           if (token.termText().equals("phrase")) {
             inPhrase = true;
             savedStart = token.startOffset();
             savedEnd = token.endOffset();
-            return new Token("phrase1", savedStart, savedEnd);
+            token.setTermBuffer("phrase1".toCharArray(), 0, "phrase1".length());
+            return token;
           }
          else if (!token.termText().equals("stop"))
            return token;
        }
Index: src/test/org/apache/lucene/analysis/TestPerFieldAnalzyerWrapper.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestPerFieldAnalzyerWrapper.java	(revision 683629)
+++ src/test/org/apache/lucene/analysis/TestPerFieldAnalzyerWrapper.java	(working copy)
@@ -29,14 +29,14 @@
     TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text));
-    Token token = tokenStream.next();
+    Token token = tokenStream.next(new Token());
     assertEquals("WhitespaceAnalyzer does not lowercase", "Qwerty", token.termText());
     tokenStream = analyzer.tokenStream("special", new StringReader(text));
-    token = tokenStream.next();
+    token = tokenStream.next(token);
     assertEquals("SimpleAnalyzer lowercases", "qwerty", token.termText());
Index: src/test/org/apache/lucene/analysis/TeeSinkTokenTest.java
===================================================================
--- src/test/org/apache/lucene/analysis/TeeSinkTokenTest.java	(revision 683629)
+++ src/test/org/apache/lucene/analysis/TeeSinkTokenTest.java	(working copy)
@@ -69,16 +69,15 @@
       }
     };
     TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), sink1);
-    Token token = null;
     int i = 0;
-    while ((token = source.next()) != null) {
+    for (Token token = source.next(new Token()); token != null; token = source.next(token)) {
       assertTrue(token.termText() + " is not equal to " + tokens1[i], token.termText().equals(tokens1[i]) == true);
       i++;
     }
     assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
     assertTrue("sink1 Size: " + sink1.getTokens().size() + " is not: " + 2, sink1.getTokens().size() == 2);
     i = 0;
-    while ((token = sink1.next()) != null) {
+    for (Token token = sink1.next(new Token()); token != null; token = sink1.next(token)) {
       assertTrue(token.termText() + " is not equal to " + "The", token.termText().equalsIgnoreCase("The") == true);
       i++;
     }
@@ -102,9 +101,8 @@
     };
     TokenStream source1 = new CachingTokenFilter(new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), theDetector), dogDetector));
     TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer2.toString())), theDetector), dogDetector);
-    Token token = null;
     int i = 0;
-    while ((token = source1.next()) != null) {
+    for (Token token = source1.next(new Token()); token != null; token = source1.next(token)) {
       assertTrue(token.termText() + " is not equal to " + tokens1[i], token.termText().equals(tokens1[i]) == true);
       i++;
     }
@@ -112,7 +110,7 @@
     assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 2, theDetector.getTokens().size() == 2);
     assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 1, dogDetector.getTokens().size() == 1);
     i = 0;
-    while ((token = source2.next()) != null) {
+    for (Token token = source2.next(new Token()); token != null; token = source2.next(token)) {
       assertTrue(token.termText() + " is not equal to " + tokens2[i], token.termText().equals(tokens2[i]) == true);
       i++;
     }
@@ -120,13 +118,13 @@
     assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 4, theDetector.getTokens().size() == 4);
     assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 2, dogDetector.getTokens().size() == 2);
     i = 0;
-    while ((token = theDetector.next()) != null) {
+    for (Token token = theDetector.next(new Token()); token != null; token = theDetector.next(token)) {
       assertTrue(token.termText() + " is not equal to " + "The", token.termText().equalsIgnoreCase("The") == true);
       i++;
     }
     assertTrue(i + " does not equal: " + theDetector.getTokens().size(), i == theDetector.getTokens().size());
     i = 0;
-    while ((token = dogDetector.next()) != null) {
+    for (Token token = dogDetector.next(new Token()); token != null; token = dogDetector.next(token)) {
       assertTrue(token.termText() + " is not equal to " + "Dogs", token.termText().equalsIgnoreCase("Dogs") == true);
       i++;
     }
@@ -134,7 +132,7 @@
     source1.reset();
     TokenStream lowerCasing = new LowerCaseFilter(source1);
     i = 0;
-    while ((token = lowerCasing.next()) != null) {
+    for (Token token = lowerCasing.next(new Token()); token != null; token = lowerCasing.next(token)) {
       assertTrue(token.termText() + " is not equal to " + tokens1[i].toLowerCase(), token.termText().equals(tokens1[i].toLowerCase()) == true);
       i++;
     }
Index: src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java	(revision 683629)
+++ src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java	(working copy)
@@ -42,11 +42,16 @@
     TokenStream stream = new TokenStream() {
       private int index = 0;
-      public Token next() throws IOException {
+      public Token next(Token token) throws IOException {
         if (index == tokens.length) {
           return null;
         } else {
-          return new Token(tokens[index++], 0, 0);
+          token.clear();
+          token.setTermText(tokens[index++]);
+          token.setStartOffset(0);
+          token.setEndOffset(0);
+          token.setType(Token.DEFAULT_TYPE);
+          return token;
         }
       }
@@ -91,8 +96,7 @@
   private void checkTokens(TokenStream stream) throws IOException {
     int count = 0;
-    Token token;
-    while ((token = stream.next()) != null) {
+    for (Token token = stream.next(new Token()); token != null; token = stream.next(token)) {
       assertTrue(count < tokens.length);
       assertEquals(tokens[count], token.termText());
       count++;
Index: src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java	(revision 683629)
+++ src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java	(working copy)
@@ -35,8 +35,9 @@
   public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes, int[] expectedPosIncrs) throws Exception {
     TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+    Token t = new Token();
     for (int i = 0; i < expectedImages.length; i++) {
-      Token t = ts.next();
+      t = ts.next(t);
       assertNotNull(t);
       assertEquals(expectedImages[i], t.termText());
       if (expectedTypes != null) {
@@ -46,7 +47,7 @@
         assertEquals(expectedPosIncrs[i], t.getPositionIncrement());
       }
     }
-    assertNull(ts.next());
+    assertNull(ts.next(t));
     ts.close();
   }
Index: src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java	(revision 683629)
+++ src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java	(working copy)
@@ -25,81 +25,82 @@
   public void testU() throws Exception {
     TokenStream stream = new WhitespaceTokenizer(new StringReader("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï IJ Ð Ñ Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ij ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl"));
     ISOLatin1AccentFilter filter = new ISOLatin1AccentFilter(stream);
-    assertEquals("Des", filter.next().termText());
-    assertEquals("mot", filter.next().termText());
-    assertEquals("cles", filter.next().termText());
-    assertEquals("A", filter.next().termText());
-    assertEquals("LA", filter.next().termText());
-    assertEquals("CHAINE", filter.next().termText());
-    assertEquals("A", filter.next().termText());
-    assertEquals("A", filter.next().termText());
-    assertEquals("A", filter.next().termText());
-    assertEquals("A", filter.next().termText());
-    assertEquals("A", filter.next().termText());
-    assertEquals("A", filter.next().termText());
-    assertEquals("AE", filter.next().termText());
-    assertEquals("C", filter.next().termText());
-    assertEquals("E", filter.next().termText());
-    assertEquals("E", filter.next().termText());
-    assertEquals("E", filter.next().termText());
-    assertEquals("E", filter.next().termText());
-    assertEquals("I", filter.next().termText());
-    assertEquals("I", filter.next().termText());
-    assertEquals("I", filter.next().termText());
-    assertEquals("I", filter.next().termText());
-    assertEquals("IJ", filter.next().termText());
-    assertEquals("D", filter.next().termText());
-    assertEquals("N", filter.next().termText());
-    assertEquals("O", filter.next().termText());
-    assertEquals("O", filter.next().termText());
-    assertEquals("O", filter.next().termText());
-    assertEquals("O", filter.next().termText());
-    assertEquals("O", filter.next().termText());
-    assertEquals("O", filter.next().termText());
-    assertEquals("OE", filter.next().termText());
-    assertEquals("TH", filter.next().termText());
-    assertEquals("U", filter.next().termText());
-    assertEquals("U", filter.next().termText());
-    assertEquals("U", filter.next().termText());
-    assertEquals("U", filter.next().termText());
-    assertEquals("Y", filter.next().termText());
-    assertEquals("Y", filter.next().termText());
-    assertEquals("a", filter.next().termText());
-    assertEquals("a", filter.next().termText());
-    assertEquals("a", filter.next().termText());
-    assertEquals("a", filter.next().termText());
-    assertEquals("a", filter.next().termText());
-    assertEquals("a", filter.next().termText());
-    assertEquals("ae", filter.next().termText());
-    assertEquals("c", filter.next().termText());
-    assertEquals("e", filter.next().termText());
-    assertEquals("e", filter.next().termText());
-    assertEquals("e", filter.next().termText());
-    assertEquals("e", filter.next().termText());
-    assertEquals("i", filter.next().termText());
-    assertEquals("i", filter.next().termText());
-    assertEquals("i", filter.next().termText());
-    assertEquals("i", filter.next().termText());
-    assertEquals("ij", filter.next().termText());
-    assertEquals("d", filter.next().termText());
-    assertEquals("n", filter.next().termText());
-    assertEquals("o", filter.next().termText());
-    assertEquals("o", filter.next().termText());
-    assertEquals("o", filter.next().termText());
-    assertEquals("o", filter.next().termText());
-    assertEquals("o", filter.next().termText());
-    assertEquals("o", filter.next().termText());
-    assertEquals("oe", filter.next().termText());
-    assertEquals("ss", filter.next().termText());
-    assertEquals("th", filter.next().termText());
-    assertEquals("u", filter.next().termText());
-    assertEquals("u", filter.next().termText());
-    assertEquals("u", filter.next().termText());
-    assertEquals("u", filter.next().termText());
-    assertEquals("y", filter.next().termText());
-    assertEquals("y", filter.next().termText());
-    assertEquals("fi", filter.next().termText());
-    assertEquals("fl", filter.next().termText());
-    assertNull(filter.next());
+    Token token = new Token();
+    assertEquals("Des", filter.next(token).termText());
+    assertEquals("mot", filter.next(token).termText());
+    assertEquals("cles", filter.next(token).termText());
+    assertEquals("A", filter.next(token).termText());
+    assertEquals("LA", filter.next(token).termText());
+    assertEquals("CHAINE", filter.next(token).termText());
+    assertEquals("A", filter.next(token).termText());
+    assertEquals("A", filter.next(token).termText());
+    assertEquals("A", filter.next(token).termText());
+    assertEquals("A", filter.next(token).termText());
+    assertEquals("A", filter.next(token).termText());
+    assertEquals("A", filter.next(token).termText());
+    assertEquals("AE", filter.next(token).termText());
+    assertEquals("C", filter.next(token).termText());
+    assertEquals("E", filter.next(token).termText());
+    assertEquals("E", filter.next(token).termText());
+    assertEquals("E", filter.next(token).termText());
+    assertEquals("E", filter.next(token).termText());
+    assertEquals("I", filter.next(token).termText());
+    assertEquals("I", filter.next(token).termText());
+    assertEquals("I", filter.next(token).termText());
+    assertEquals("I", filter.next(token).termText());
+    assertEquals("IJ", filter.next(token).termText());
+    assertEquals("D", filter.next(token).termText());
+    assertEquals("N", filter.next(token).termText());
+    assertEquals("O", filter.next(token).termText());
+    assertEquals("O", filter.next(token).termText());
+    assertEquals("O", filter.next(token).termText());
+    assertEquals("O", filter.next(token).termText());
+    assertEquals("O", filter.next(token).termText());
+    assertEquals("O", filter.next(token).termText());
+    assertEquals("OE", filter.next(token).termText());
+    assertEquals("TH", filter.next(token).termText());
+    assertEquals("U", filter.next(token).termText());
+    assertEquals("U", filter.next(token).termText());
+    assertEquals("U", filter.next(token).termText());
+    assertEquals("U", filter.next(token).termText());
+    assertEquals("Y", filter.next(token).termText());
+    assertEquals("Y", filter.next(token).termText());
+    assertEquals("a", filter.next(token).termText());
+    assertEquals("a", filter.next(token).termText());
+    assertEquals("a", filter.next(token).termText());
+    assertEquals("a", filter.next(token).termText());
+    assertEquals("a", filter.next(token).termText());
+    assertEquals("a", filter.next(token).termText());
+    assertEquals("ae", filter.next(token).termText());
+    assertEquals("c", filter.next(token).termText());
+    assertEquals("e", filter.next(token).termText());
+    assertEquals("e", filter.next(token).termText());
+    assertEquals("e", filter.next(token).termText());
+    assertEquals("e", filter.next(token).termText());
+    assertEquals("i", filter.next(token).termText());
+    assertEquals("i", filter.next(token).termText());
+    assertEquals("i", filter.next(token).termText());
+    assertEquals("i", filter.next(token).termText());
+    assertEquals("ij", filter.next(token).termText());
+    assertEquals("d", filter.next(token).termText());
+    assertEquals("n", filter.next(token).termText());
+    assertEquals("o", filter.next(token).termText());
+    assertEquals("o", filter.next(token).termText());
+    assertEquals("o", filter.next(token).termText());
+    assertEquals("o", filter.next(token).termText());
+    assertEquals("o", filter.next(token).termText());
+    assertEquals("o", filter.next(token).termText());
+    assertEquals("oe", filter.next(token).termText());
+    assertEquals("ss", filter.next(token).termText());
+    assertEquals("th", filter.next(token).termText());
+    assertEquals("u", filter.next(token).termText());
+    assertEquals("u", filter.next(token).termText());
+    assertEquals("u", filter.next(token).termText());
+    assertEquals("u", filter.next(token).termText());
+    assertEquals("y", filter.next(token).termText());
+    assertEquals("y", filter.next(token).termText());
+    assertEquals("fi", filter.next(token).termText());
+    assertEquals("fl", filter.next(token).termText());
+    assertNull(filter.next(token));
   }
 }
Index: src/test/org/apache/lucene/analysis/TestLengthFilter.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestLengthFilter.java	(revision 683629)
+++ src/test/org/apache/lucene/analysis/TestLengthFilter.java	(working copy)
@@ -27,10 +27,11 @@
     TokenStream stream = new WhitespaceTokenizer(
         new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
     LengthFilter filter = new LengthFilter(stream, 2, 6);
-    assertEquals("short", filter.next().termText());
-    assertEquals("ab", filter.next().termText());
-    assertEquals("foo", filter.next().termText());
-    assertNull(filter.next());
+    Token token = new Token();
+    assertEquals("short", filter.next(token).termText());
+    assertEquals("ab", filter.next(token).termText());
+    assertEquals("foo", filter.next(token).termText());
+    assertNull(filter.next(token));
   }
 }
Index: src/test/org/apache/lucene/analysis/TestAnalyzers.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestAnalyzers.java	(revision 683629)
+++ src/test/org/apache/lucene/analysis/TestAnalyzers.java	(working copy)
@@ -35,12 +35,13 @@
                     String input,
                     String[] output) throws Exception {
     TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+    Token t = new Token();
     for (int i=0; i test with enable-increments-"+(enableIcrements?"enabled":"disabled"));
     stpf.setEnablePositionIncrements(enableIcrements);
+    Token t = new Token();
     for (int i=0; i<20; i+=3) {
-      Token t = stpf.next();
+      t = stpf.next(t);
       log("Token "+i+": "+t);
       String w = English.intToEnglish(i).trim();
       assertEquals("expecting token "+i+" to be "+w,w,t.termText());
       assertEquals("all but first token must have position increment of 3",enableIcrements?(i==0?1:3):1,t.getPositionIncrement());
     }
-    assertNull(stpf.next());
+    assertNull(stpf.next(t));
   }
   // print debug info depending on VERBOSE
Index: src/test/org/apache/lucene/AnalysisTest.java
===================================================================
--- src/test/org/apache/lucene/AnalysisTest.java	(revision 683629)
+++ src/test/org/apache/lucene/AnalysisTest.java	(working copy)
@@ -70,11 +70,11 @@
     Date start = new Date();
     int count = 0;
-    for (Token t = stream.next(); t!=null; t = stream.next()) {
+    for (Token t = stream.next(new Token()); t != null; t = stream.next(t)) {
       if (verbose) {
-        System.out.println("Text=" + new String(t.termBuffer(), 0, t.termLength())
-                           + " start=" + t.startOffset()
-                           + " end=" + t.endOffset());
+        System.out.println("Text=" + new String(t.termBuffer(), 0, t.termLength())
+            + " start=" + t.startOffset()
+            + " end=" + t.endOffset());
       }
       count++;
     }
Index: src/test/org/apache/lucene/search/TestPositionIncrement.java
===================================================================
--- src/test/org/apache/lucene/search/TestPositionIncrement.java	(revision 683629)
+++ src/test/org/apache/lucene/search/TestPositionIncrement.java	(working copy)
@@ -205,9 +205,7 @@
     TokenStream ts = analyzer.tokenStream("field", new StringReader("one two three four five"));
-    while (true) {
-      Token token = ts.next();
-      if (token == null) break;
+    for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
       assertEquals(token.termText(), 1, token.getPositionIncrement());
     }
   }
Index: src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java
===================================================================
--- src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java	(revision 683629)
+++ src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java	(working copy)
@@ -62,22 +62,22 @@
       this.fieldName = fieldName;
     }
-    public Token next() throws IOException {
-      Token result = input.next();
-      if (result != null) {
+    public Token next(Token token) throws IOException {
+      token = input.next(token);
+      if (token != null) {
         if (fieldName.equals("field")) {
-          result.setPayload(new Payload(payloadField));
+          token.setPayload(new Payload(payloadField));
         } else if (fieldName.equals("multiField")) {
           if (numSeen % 2 == 0) {
-            result.setPayload(new Payload(payloadMultiField1));
+            token.setPayload(new Payload(payloadMultiField1));
           } else {
-            result.setPayload(new Payload(payloadMultiField2));
+            token.setPayload(new Payload(payloadMultiField2));
           }
           numSeen++;
         }
       }
-      return result;
+      return token;
     }
   }
Index: src/test/org/apache/lucene/index/TestIndexWriter.java
===================================================================
--- src/test/org/apache/lucene/index/TestIndexWriter.java	(revision 683629)
+++ src/test/org/apache/lucene/index/TestIndexWriter.java	(working copy)
@@ -1786,11 +1786,11 @@
       return new TokenFilter(new StandardTokenizer(reader)) {
         private int count = 0;
-        public Token next() throws IOException {
+        public Token next(Token token) throws IOException {
           if (count++ == 5) {
             throw new IOException();
           }
-          return input.next();
+          return input.next(token);
         }
       };
     }
Index: src/test/org/apache/lucene/index/TestMultiLevelSkipList.java
===================================================================
--- src/test/org/apache/lucene/index/TestMultiLevelSkipList.java	(revision 683629)
+++ src/test/org/apache/lucene/index/TestMultiLevelSkipList.java	(working copy)
@@ -103,12 +103,12 @@
       super(input);
     }
-    public Token next() throws IOException {
-      Token t = input.next();
-      if (t != null) {
-        t.setPayload(new Payload(new byte[] { (byte) count++ }));
+    public Token next(Token token) throws IOException {
+      token = input.next(token);
+      if (token != null) {
+        token.setPayload(new Payload(new byte[] { (byte) count++ }));
       }
-      return t;
+      return token;
     }
   }
Index: src/java/org/apache/lucene/analysis/CachingTokenFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/CachingTokenFilter.java	(revision 683629)
+++ src/java/org/apache/lucene/analysis/CachingTokenFilter.java	(working copy)
@@ -40,11 +40,11 @@
     super(input);
   }
-  public Token next() throws IOException {
+  public Token next(Token token) throws IOException {
     if (cache == null) {
       // fill cache lazily
       cache = new LinkedList();
-      fillCache();
+      fillCache(token);
       iterator = cache.iterator();
     }
@@ -52,20 +52,20 @@
       // the cache is exhausted, return null
       return null;
     }
-
-    return (Token) iterator.next();
+    // Since the TokenFilter can be reset, the tokens need to be preserved as immutable.
+    Token t = (Token) iterator.next();
+    return (Token) t.clone();
   }
   public void reset() throws IOException {
     if(cache != null) {
-      iterator = cache.iterator();
+      iterator = cache.iterator();
     }
   }
-  private void fillCache() throws IOException {
-    Token token;
-    while ( (token = input.next()) != null) {
-      cache.add(token);
+  private void fillCache(Token token) throws IOException {
+    for (token = input.next(token); token != null; token = input.next(token)) {
+      cache.add(token.clone());
     }
   }
Index: contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java
===================================================================
--- contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java	(revision 683629)
+++ contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java	(working copy)
@@ -66,12 +66,13 @@
                     String input,
                     String[] output) throws Exception {
     TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+    Token t = new Token();
     for (int i = 0; i < output.length; i++) {
-      Token t = ts.next();
+      t = ts.next(t);
       assertNotNull(t);
       assertEquals(output[i], t.termText());
     }
-    assertNull(ts.next());
+    assertNull(ts.next(t));
     ts.close();
   }
@@ -88,14 +89,14 @@
     SnowballFilter filter = new SnowballFilter(
       new TokenStream() {
-        public Token next() {
+        public Token next(Token token) {
           return tok;
         }
       },
       "English"
     );
-    Token newtok = filter.next();
+    Token newtok = filter.next(new Token());
     assertEquals("accent", newtok.termText());
     assertEquals(2, newtok.startOffset());
Index: contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java
===================================================================
--- contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java	(revision 683629)
+++ contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java	(working copy)
@@ -60,20 +60,27 @@
   }
   /** Returns the next input Token, after being stemmed */
-  public final Token next() throws IOException {
-    Token token = input.next();
+  public final Token next(Token token) throws IOException {
+    token = input.next(token);
     if (token == null)
       return null;
-    stemmer.setCurrent(token.termText());
+    String originalTerm = new String(token.termBuffer(), 0, token.termLength());
+    stemmer.setCurrent(originalTerm);
     try {
       stemMethod.invoke(stemmer, EMPTY_ARGS);
     } catch (Exception e) {
       throw new RuntimeException(e.toString());
     }
-
-    Token newToken = new Token(stemmer.getCurrent(),
-        token.startOffset(), token.endOffset(), token.type());
-    newToken.setPositionIncrement(token.getPositionIncrement());
-    return newToken;
+    String finalTerm = stemmer.getCurrent();
+    // Don't bother updating if it is unchanged.
+    if (!originalTerm.equals(finalTerm)) {
+      int termLength = finalTerm.length();
+      char[] termBuffer = token.termBuffer();
+      if (termBuffer.length < termLength)
+        termBuffer = token.resizeTermBuffer(termLength);
+      finalTerm.getChars(0, termLength, termBuffer, 0);
+      token.setTermLength(termLength);
+    }
+    return token;
   }
 }
Index: contrib/memory/src/test/org/apache/lucene/index/memory/PatternAnalyzerTest.java
===================================================================
--- contrib/memory/src/test/org/apache/lucene/index/memory/PatternAnalyzerTest.java	(revision 683629)
+++ contrib/memory/src/test/org/apache/lucene/index/memory/PatternAnalyzerTest.java	(working copy)
@@ -197,9 +197,8 @@
   private List getTokens(TokenStream stream) throws IOException {
     ArrayList tokens = new ArrayList();
-    Token token;
-    while ((token = stream.next()) != null) {
-      tokens.add(token);
+    for (Token token = stream.next(new Token()); token != null; token = stream.next(token)) {
+      tokens.add(token.clone());
     }
     return tokens;
   }
Index: contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
===================================================================
--- contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java	(revision 683629)
+++ contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java	(working copy)
@@ -349,9 +349,7 @@
         HashMap terms = new HashMap();
         int numTokens = 0;
         int pos = -1;
-        Token token;
-
-        while ((token = stream.next()) != null) {
+        for (Token token = stream.next(new Token()); token != null; token = stream.next(token)) {
           String term = token.termText();
           if (term.length() == 0) continue; // nothing to do
//          if (DEBUG) System.err.println("token='" + term + "'");
Index: contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java
===================================================================
--- contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java	(revision 683629)
+++ contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java	(working copy)
@@ -73,8 +73,8 @@
         return new TokenFilter(child.tokenStream(fieldName, reader)) {
           private int position = -1;
-          public Token next() throws IOException {
-            Token token = input.next(); // from filter super class
+          public Token next(Token token) throws IOException {
+            token = input.next(token); // from filter super class
             log.println(toString(token));
             return token;
           }
@@ -121,8 +121,8 @@
         return new TokenFilter(child.tokenStream(fieldName, reader)) {
           private int todo = maxTokens;
-          public Token next() throws IOException {
-            return --todo >= 0 ? input.next() : null;
+          public Token next(Token token) throws IOException {
+            return --todo >= 0 ? input.next(token) : null;
           }
         };
       }
@@ -239,9 +239,9 @@
     final ArrayList tokens2 = new ArrayList();
     TokenStream tokenStream = new TokenFilter(child.tokenStream(fieldName, reader)) {
-      public Token next() throws IOException {
-        Token token = input.next(); // from filter super class
-        if (token != null) tokens2.add(token);
+      public Token next(Token token) throws IOException {
+        token = input.next(token); // from filter super class
+        if (token != null) tokens2.add(token.clone());
         return token;
       }
     };
@@ -300,8 +300,7 @@
     HashMap map = new HashMap();
     TokenStream stream = analyzer.tokenStream("", new StringReader(text));
     try {
-      Token token;
-      while ((token = stream.next()) != null) {
+      for (Token token = stream.next(new Token()); token != null; token = stream.next(token)) {
         MutableInteger freq = (MutableInteger) map.get(token.termText());
         if (freq == null) {
           freq = new MutableInteger(1);
Index: contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java
===================================================================
--- contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java	(revision 683629)
+++ contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java	(working copy)
@@ -68,20 +68,16 @@
   }
   /** Returns the next token in the stream, or null at EOS. */
-  public Token next() throws IOException {
-    Token token;
+  public Token next(Token token) throws IOException {
     while (todo > 0 && index < stack.length) { // pop from stack
-      token = createToken(stack[index++], current);
-      if (token != null) {
-        todo--;
-        return token;
-      }
+      todo--;
+      return createToken(stack[index++], current, token);
     }
-    token = input.next();
+    token = input.next(token);
     if (token == null) return null; // EOS; iterator exhausted
-
-    stack = synonyms.getSynonyms(token.termText()); // push onto stack
+
+    stack = synonyms.getSynonyms(new String(token.termBuffer(), 0, token.termLength())); // push onto stack
     if (stack.length > maxSynonyms) randomize(stack);
     index = 0;
     current = token;
@@ -97,14 +93,25 @@
    *          a synonym for the current token's term
    * @param current
    *          the current token from the underlying child stream
+   * @param result
+   *          the reusable token to fill in
    * @return a new token, or null to indicate that the given synonym should be
    *         ignored
    */
-  protected Token createToken(String synonym, Token current) {
-    Token token = new Token(
-        synonym, current.startOffset(), current.endOffset(), SYNONYM_TOKEN_TYPE);
-    token.setPositionIncrement(0);
-    return token;
+  protected Token createToken(String synonym, Token current, Token result) {
+    int synonymLength = synonym.length();
+    char[] termBuffer = result.termBuffer();
+    if (termBuffer.length < synonymLength)
+      termBuffer = result.resizeTermBuffer(synonymLength);
+    synonym.getChars(0, synonymLength, termBuffer, 0);
+    result.setTermLength(synonymLength);
+    result.setStartOffset(current.startOffset());
+    result.setEndOffset(current.endOffset());
+    result.setType(SYNONYM_TOKEN_TYPE);
+    result.setPositionIncrement(0);
+    result.setPayload(current.getPayload());
+    result.setFlags(current.getFlags());
+    return result;
   }
   /**
Index: contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java
===================================================================
--- contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java	(revision 683629)
+++ contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java	(working copy)
@@ -42,12 +42,13 @@
    */
   private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
     TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+    Token t = new Token();
     for (int i=0; i tokens;
@@ -70,20 +65,21 @@
     ts = new ShingleMatrixFilter(tls, 1, 2, ' ', false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec());
-    assertNext(ts, "please", 0, 6);
-    assertNext(ts, "please divide", 0, 13);
-    assertNext(ts, "divide", 7, 13);
-    assertNext(ts, "divide this", 7, 18);
-    assertNext(ts, "this", 14, 18);
-    assertNext(ts, "this sentence", 14, 27);
-    assertNext(ts, "sentence", 19, 27);
-    assertNext(ts, "sentence into", 19, 32);
-    assertNext(ts, "into", 28, 32);
-    assertNext(ts, "into shingles", 28, 39);
-    assertNext(ts, "shingles", 33, 39);
+    Token token = new Token();
+    assertNext(ts, token, "please", 0, 6);
+    assertNext(ts, token, "please divide", 0, 13);
+    assertNext(ts, token, "divide", 7, 13);
+    assertNext(ts, token, "divide this", 7, 18);
+    assertNext(ts, token, "this", 14, 18);
+    assertNext(ts, token, "this sentence", 14, 27);
+    assertNext(ts, token, "sentence", 19, 27);
+    assertNext(ts, token, "sentence into", 19, 32);
+    assertNext(ts, token, "into", 28, 32);
+    assertNext(ts, token, "into shingles", 28, 39);
+    assertNext(ts, token, "shingles", 33, 39);
-    assertNull(ts.next());
+    assertNull(ts.next(token));
   }
@@ -95,9 +91,6 @@
     ShingleMatrixFilter.defaultSettingsCodec = null;//new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();
-    Token token = new Token(); // for debug use only
-
     TokenStream ts;
     TokenListStream tls;
     LinkedList tokens;
@@ -117,25 +110,27 @@
     ts = new ShingleMatrixFilter(tls, 2, 2, '_', false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
-    assertNext(ts, "hello_world");
-    assertNext(ts, "greetings_world");
-    assertNext(ts, "hello_earth");
-    assertNext(ts, "greetings_earth");
-    assertNext(ts, "hello_tellus");
-    assertNext(ts, "greetings_tellus");
-    assertNull(ts.next());
+    Token token = new Token();
+    assertNext(ts, token, "hello_world");
+    assertNext(ts, token, "greetings_world");
+    assertNext(ts, token, "hello_earth");
+    assertNext(ts, token, "greetings_earth");
+    assertNext(ts, token, "hello_tellus");
+    assertNext(ts, token, "greetings_tellus");
+    assertNull(ts.next(token));
     // bi-grams with no spacer character, start offset, end offset
     tls.reset();
     ts = new ShingleMatrixFilter(tls, 2, 2, null, false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
-    assertNext(ts, "helloworld", 0, 10);
-    assertNext(ts, "greetingsworld", 0, 10);
-    assertNext(ts, "helloearth", 0, 10);
-    assertNext(ts, "greetingsearth", 0, 10);
-    assertNext(ts, "hellotellus", 0, 10);
-    assertNext(ts, "greetingstellus", 0, 10);
-    assertNull(ts.next());
+    token = new Token();
+    assertNext(ts, token, "helloworld", 0, 10);
+    assertNext(ts, token, "greetingsworld", 0, 10);
+    assertNext(ts, token, "helloearth", 0, 10);
+    assertNext(ts, token, "greetingsearth", 0, 10);
+    assertNext(ts, token, "hellotellus", 0, 10);
+    assertNext(ts, token, "greetingstellus", 0, 10);
+    assertNull(ts.next(token));
     // add ^_prefix_and_suffix_$
@@ -165,18 +160,19 @@
 //      token.clear();
 //    }
-    assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
-    assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
-    assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
-    assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
-    assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
-    assertNull(ts.next());
+    token = new Token();
+    assertNext(ts, token, "^_hello", 1, 10.049875f, 0, 4);
+    assertNext(ts, token, "^_greetings", 1, 10.049875f, 0, 4);
+    assertNext(ts, token, "hello_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "greetings_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "hello_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "greetings_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "hello_tellus", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "greetings_tellus", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "world_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, token, "earth_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, token, "tellus_$", 1, 7.1414285f, 5, 10);
+    assertNull(ts.next(token));
     // test unlimited size and allow single boundary token as shingle
     tls.reset();
@@ -187,46 +183,46 @@
 //      System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
 //      token.clear();
 //    }
+    token = new Token();
+    assertNext(ts, token, "^", 1, 10.0f, 0, 0);
+    assertNext(ts, token, "^_hello", 1, 10.049875f, 0, 4);
+    assertNext(ts, token, "^_hello_world", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_hello_world_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "hello", 1, 1.0f, 0, 4);
+    assertNext(ts, token, "hello_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "hello_world_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, token, "world", 1, 1.0f, 5, 10);
+    assertNext(ts, token, "world_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, token, "$", 1, 7.071068f, 10, 10);
+    assertNext(ts, token, "^_greetings", 1, 10.049875f, 0, 4);
+    assertNext(ts, token, "^_greetings_world", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_greetings_world_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "greetings", 1, 1.0f, 0, 4);
+    assertNext(ts, token, "greetings_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "greetings_world_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, token, "^_hello_earth", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_hello_earth_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "hello_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "hello_earth_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, token, "earth", 1, 1.0f, 5, 10);
+    assertNext(ts, token, "earth_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, token, "^_greetings_earth", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "greetings_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "greetings_earth_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, token, "^_hello_tellus", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "hello_tellus", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "hello_tellus_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, token, "tellus", 1, 1.0f, 5, 10);
+    assertNext(ts, token, "tellus_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, token, "^_greetings_tellus", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "greetings_tellus", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
-    assertNext(ts, "^", 1, 10.0f, 0, 0);
-    assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
-    assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "hello", 1, 1.0f, 0, 4);
-    assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
-    assertNext(ts, "world", 1, 1.0f, 5, 10);
-    assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
-    assertNext(ts, "$", 1, 7.071068f, 10, 10);
-    assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
-    assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "greetings", 1, 1.0f, 0, 4);
-    assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
-    assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
-    assertNext(ts, "earth", 1, 1.0f, 5, 10);
-    assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
-    assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
-    assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
-    assertNext(ts, "tellus", 1, 1.0f, 5, 10);
-    assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
-    assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
+    assertNull(ts.next(token));
-    assertNull(ts.next());
-
     // test unlimited size but don't allow single boundary token as shingle
     tls.reset();
@@ -236,43 +232,44 @@
 //      token.clear();
 //    }
-    assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
-    assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "hello", 1, 1.0f, 0, 4);
-    assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
-    assertNext(ts, "world", 1, 1.0f, 5, 10);
-    assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
-    assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
-    assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "greetings", 1, 1.0f, 0, 4);
-    assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
-    assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
-    assertNext(ts, "earth", 1, 1.0f, 5, 10);
-    assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
-    assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
-    assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
-    assertNext(ts, "tellus", 1, 1.0f, 5, 10);
-    assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
-    assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
+    token = new Token();
+    assertNext(ts, token, "^_hello", 1, 10.049875f, 0, 4);
+    assertNext(ts, token, "^_hello_world", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_hello_world_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "hello", 1, 1.0f, 0, 4);
+    assertNext(ts, token, "hello_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "hello_world_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, token, "world", 1, 1.0f, 5, 10);
+    assertNext(ts, token, "world_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, token, "^_greetings", 1, 10.049875f, 0, 4);
+    assertNext(ts, token, "^_greetings_world", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_greetings_world_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "greetings", 1, 1.0f, 0, 4);
+    assertNext(ts, token, "greetings_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "greetings_world_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, token, "^_hello_earth", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_hello_earth_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "hello_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "hello_earth_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, token, "earth", 1, 1.0f, 5, 10);
+    assertNext(ts, token, "earth_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, token, "^_greetings_earth", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "greetings_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "greetings_earth_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, token, "^_hello_tellus", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "hello_tellus", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "hello_tellus_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, token, "tellus", 1, 1.0f, 5, 10);
+    assertNext(ts, token, "tellus_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, token, "^_greetings_tellus", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "greetings_tellus", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
-    assertNull(ts.next());
+    assertNull(ts.next(token));
     System.currentTimeMillis();
@@ -306,22 +303,22 @@
 //    }
     // shingle, position increment, weight, start offset, end offset
+    token = new Token();
+    assertNext(ts, token, "hello_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "greetings_and", 1, 1.4142135f, 0, 4);
+    assertNext(ts, token, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
+    assertNext(ts, token, "and_salutations", 1, 1.4142135f, 0, 4);
+    assertNext(ts, token, "and_salutations_world", 1, 1.7320508f, 0, 10);
+    assertNext(ts, token, "salutations_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "hello_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "and_salutations_earth", 1, 1.7320508f, 0, 10);
+    assertNext(ts, token, "salutations_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "hello_tellus", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
+    assertNext(ts, token, "salutations_tellus", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "greetings_and", 1, 1.4142135f, 0, 4);
-    assertNext(ts, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
-    assertNext(ts, "and_salutations", 1, 1.4142135f, 0, 4);
-    assertNext(ts, "and_salutations_world", 1, 1.7320508f, 0, 10);
-    assertNext(ts, "salutations_world", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "and_salutations_earth", 1, 1.7320508f, 0, 10);
-    assertNext(ts, "salutations_earth", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
-    assertNext(ts, "salutations_tellus", 1, 1.4142135f, 0, 10);
+    assertNull(ts.next(token));
-    assertNull(ts.next());
-
     System.currentTimeMillis();
@@ -366,49 +363,49 @@
 //      System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
 //      token.clear();
 //    }
+    Token token = new Token();
+    assertNext(ts, token, "no_surprise", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "no_surprise_to", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "no_surprise_to_see", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "surprise_to", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "surprise_to_see", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "surprise_to_see_england", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "to_see", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "to_see_england", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "to_see_england_manager", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "see_england", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "see_england_manager", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "see_england_manager_svennis", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "england_manager", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "england_manager_svennis", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "england_manager_svennis_in", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "manager_svennis", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "manager_svennis_in", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "manager_svennis_in_the", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "svennis_in", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "svennis_in_the", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "svennis_in_the_croud", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "in_the", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "in_the_croud", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "the_croud", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "see_england_manager_sven", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "england_manager_sven", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "england_manager_sven_göran", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "manager_sven", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "manager_sven_göran", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "sven_göran", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "sven_göran_eriksson", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "sven_göran_eriksson_in", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "göran_eriksson", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "göran_eriksson_in", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "göran_eriksson_in_the", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "eriksson_in", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "eriksson_in_the", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "eriksson_in_the_croud", 1, 2.0f, 0, 0);
-    assertNext(ts, "no_surprise", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "no_surprise_to", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "no_surprise_to_see", 1, 2.0f, 0, 0);
-    assertNext(ts, "surprise_to", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "surprise_to_see", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "surprise_to_see_england", 1, 2.0f, 0, 0);
-    assertNext(ts, "to_see", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "to_see_england", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "to_see_england_manager", 1, 2.0f, 0, 0);
-    assertNext(ts, "see_england", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "see_england_manager", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "see_england_manager_svennis", 1, 2.0f, 0, 0);
-    assertNext(ts, "england_manager", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "england_manager_svennis", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "england_manager_svennis_in", 1, 2.0f, 0, 0);
-    assertNext(ts, "manager_svennis", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "manager_svennis_in", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "manager_svennis_in_the", 1, 2.0f, 0, 0);
-    assertNext(ts, "svennis_in", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "svennis_in_the", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "svennis_in_the_croud", 1, 2.0f, 0, 0);
-    assertNext(ts, "in_the", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "in_the_croud", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "the_croud", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "see_england_manager_sven", 1, 2.0f, 0, 0);
-    assertNext(ts, "england_manager_sven", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "england_manager_sven_göran", 1, 2.0f, 0, 0);
-    assertNext(ts, "manager_sven", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "manager_sven_göran", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0);
-    assertNext(ts, "sven_göran", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "sven_göran_eriksson", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "sven_göran_eriksson_in", 1, 2.0f, 0, 0);
-    assertNext(ts, "göran_eriksson", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "göran_eriksson_in", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "göran_eriksson_in_the", 1, 2.0f, 0, 0);
-    assertNext(ts, "eriksson_in", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "eriksson_in_the", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "eriksson_in_the_croud", 1, 2.0f, 0, 0);
+    assertNull(ts.next(token));
-    assertNull(ts.next());
-
   }
   private Token tokenFactory(String text, int startOffset, int endOffset) {
@@ -457,15 +454,15 @@
   // assert-methods start here
-  private Token assertNext(TokenStream ts, String text) throws IOException {
-    Token token = ts.next(new Token());
+  private Token assertNext(TokenStream ts, Token token, String text) throws IOException {
+    token = ts.next(token);
     assertNotNull(token);
     assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
     return token;
   }
-  private Token assertNext(TokenStream ts, String text, int positionIncrement, float boost) throws IOException {
-    Token token = ts.next(new Token());
+  private Token assertNext(TokenStream ts, Token token, String text, int positionIncrement, float boost) throws IOException {
+    token = ts.next(token);
     assertNotNull(token);
token.termLength())); assertEquals(positionIncrement, token.getPositionIncrement()); @@ -473,8 +470,8 @@ return token; } - private Token assertNext(TokenStream ts, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException { - Token token = ts.next(new Token()); + private Token assertNext(TokenStream ts, Token token, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException { + token = ts.next(token); assertNotNull(token); assertEquals(text, new String(token.termBuffer(), 0, token.termLength())); assertEquals(positionIncrement, token.getPositionIncrement()); @@ -484,8 +481,8 @@ return token; } - private Token assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException { - Token token = ts.next(new Token()); + private Token assertNext(TokenStream ts, Token token, String text, int startOffset, int endOffset) throws IOException { + token = ts.next(token); assertNotNull(token); assertEquals(text, new String(token.termBuffer(), 0, token.termLength())); assertEquals(startOffset, token.startOffset()); @@ -500,9 +497,8 @@ public TokenListStream(TokenStream ts) throws IOException { tokens = new ArrayList(); - Token token; - while ((token = ts.next(new Token())) != null) { - tokens.add(token); + for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) { + tokens.add((Token) token.clone()); } } Index: contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java =================================================================== --- contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (revision 683629) +++ contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (working copy) @@ -37,12 +37,13 @@ TokenStream ts = a.tokenStream("dummy", new StringReader(input)); + Token t = new Token(); for (int i = 0; i < output.length; i++) { - Token t = ts.next(); + t = ts.next(t); assertNotNull(t); assertEquals(t.termText(), output[i]); } - assertNull(ts.next()); + assertNull(ts.next(t)); ts.close(); } Index: contrib/analyzers/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (revision 683629) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (working copy) @@ -105,12 +105,12 @@ return dict; } - public Token next() throws IOException { + public Token next(Token token) throws IOException { if (tokens.size() > 0) { return (Token)tokens.removeFirst(); } - Token token = input.next(); + token = input.next(token); if (token == null) { return null; } @@ -145,9 +145,10 @@ protected final Token createToken(final int offset, final int length, final Token prototype) { - Token t = new Token(prototype.startOffset() + offset, prototype - .startOffset() - + offset + length, prototype.type()); + Token t = (Token) prototype.clone(); + int newStart = t.startOffset() + offset; + t.setStartOffset(newStart); + t.setEndOffset(newStart + length); t.setTermBuffer(prototype.termBuffer(), offset, length); t.setPositionIncrement(0); return t; Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java (revision 
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java	(working copy)
@@ -37,25 +37,19 @@
         this.charset = charset;
     }
 
-    public final Token next() throws java.io.IOException
+    public final Token next(Token token) throws java.io.IOException
     {
-        Token t = input.next();
+        token = input.next(token);
 
-        if (t == null)
+        if (token == null)
             return null;
 
-        String txt = t.termText();
-
-        char[] chArray = txt.toCharArray();
-        for (int i = 0; i < chArray.length; i++)
+        char[] chArray = token.termBuffer();
+        int chLen = token.termLength();
+        for (int i = 0; i < chLen; i++)
        {
            chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
        }
-
-        String newTxt = new String(chArray);
-        // create new token
-        Token newToken = new Token(newTxt, t.startOffset(), t.endOffset());
-
-        return newToken;
+        return token;
    }
 }
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java	(revision 683629)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java	(working copy)
@@ -32,10 +32,6 @@
  */
 public final class RussianStemFilter extends TokenFilter
 {
-    /**
-     * The actual token in the input stream.
-     */
-    private Token token = null;
     private RussianStemmer stemmer = null;
 
     public RussianStemFilter(TokenStream in, char[] charset)
@@ -47,22 +43,25 @@
     /**
      * @return  Returns the next token in the stream, or null at EOS
      */
-    public final Token next() throws IOException
-    {
-        if ((token = input.next()) == null)
-        {
-            return null;
-        }
-        else
-        {
-            String s = stemmer.stem(token.termText());
-            if (!s.equals(token.termText()))
-            {
-                return new Token(s, token.startOffset(), token.endOffset(),
-                    token.type());
-            }
-            return token;
-        }
+    public final Token next(Token token) throws IOException {
+      token = input.next(token);
+
+      if (token == null)
+        return null;
+
+      String term = new String(token.termBuffer(), 0, token.termLength());
+
+      String s = stemmer.stem(term);
+      // If not stemmed, don't waste the time adjusting the token.
+      if ((s != null) && !s.equals(term)) {
+        int termLength = s.length();
+        char[] termBuffer = token.termBuffer();
+        if (termBuffer.length < termLength)
+          termBuffer = token.resizeTermBuffer(termLength);
+        s.getChars(0, termLength, termBuffer, 0);
+        token.setTermLength(termLength);
+      }
+      return token;
     }
 
     /**
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java	(revision 683629)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java	(working copy)
@@ -34,10 +34,6 @@
  */
 public final class GermanStemFilter extends TokenFilter
 {
-    /**
-     * The actual token in the input stream.
-     */
-    private Token token = null;
     private GermanStemmer stemmer = null;
     private Set exclusionSet = null;
 
@@ -59,25 +55,28 @@
     /**
      * @return  Returns the next token in the stream, or null at EOS
      */
-    public final Token next()
-      throws IOException
-    {
-      if ( ( token = input.next() ) == null ) {
+    public final Token next(Token token) throws IOException {
+      token = input.next(token);
+
+      if (token == null)
        return null;
-      }
-      // Check the exclusiontable
-      else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) {
-        return token;
-      }
-      else {
-        String s = stemmer.stem( token.termText() );
-        // If not stemmed, dont waste the time creating a new token
-        if ( !s.equals( token.termText() ) ) {
-          return new Token( s, token.startOffset(),
-            token.endOffset(), token.type() );
+
+      String term = new String(token.termBuffer(), 0, token.termLength());
+
+      // Check the exclusion table.
+      if (exclusionSet == null || !exclusionSet.contains(term)) {
+        String s = stemmer.stem(term);
+        // If not stemmed, don't waste the time adjusting the token.
+        if ((s != null) && !s.equals(term)) {
+          int termLength = s.length();
+          char[] termBuffer = token.termBuffer();
+          if (termBuffer.length < termLength)
+            termBuffer = token.resizeTermBuffer(termLength);
+          s.getChars(0, termLength, termBuffer, 0);
+          token.setTermLength(termLength);
        }
-        return token;
      }
+      return token;
    }
 
     /**
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java	(revision 683629)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java	(working copy)
@@ -47,7 +47,7 @@
   /**
    * filler token for when positionIncrement is more than 1
    */
-  public static final String FILLER_TOKEN = "_";
+  public static final char[] FILLER_TOKEN = { '_' };
 
 
   /**
@@ -152,9 +152,9 @@
   /* (non-Javadoc)
    * @see org.apache.lucene.analysis.TokenStream#next()
    */
-  public Token next() throws IOException {
+  public Token next(Token token) throws IOException {
     if (outputBuf.isEmpty()) {
-      fillOutputBuf();
+      fillOutputBuf(token);
     }
     Token nextToken = null;
     if ( ! outputBuf.isEmpty())
@@ -173,22 +173,23 @@
    * @return the next token, or null if at end of input stream
    * @throws IOException if the input stream has a problem
    */
-  private Token getNextToken() throws IOException {
-    if (tokenBuf.isEmpty()) {
-      Token lastToken = input.next();
-      if (lastToken != null) {
-        for (int i = 1; i < lastToken.getPositionIncrement(); i++) {
-          tokenBuf.add(new Token(FILLER_TOKEN, lastToken.startOffset(),
-              lastToken.startOffset()));
-        }
-        tokenBuf.add(lastToken);
-        return getNextToken();
-      } else {
-        return null;
-      }
-    } else {
+  private Token getNextToken(Token token) throws IOException {
+    if (!tokenBuf.isEmpty())
       return (Token)tokenBuf.remove(0);
+
+    token = input.next(token);
+    if (token == null)
+      return null;
+
+    for (int i = 1; i < token.getPositionIncrement(); i++) {
+      Token fillerToken = (Token) token.clone();
+      // A filler token occupies no space
+      fillerToken.setEndOffset(fillerToken.startOffset());
+      fillerToken.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+      tokenBuf.add(fillerToken);
     }
+    tokenBuf.add(token.clone());
+    return getNextToken(token);
   }
 
   /**
@@ -196,15 +197,15 @@
    *
    * @throws IOException if there's a problem getting the next token
    */
-  private void fillOutputBuf() throws IOException {
+  private void fillOutputBuf(Token token) throws IOException {
     boolean addedToken = false;
     /*
      * Try to fill the shingle buffer.
      */
     do {
-      Token token = getNextToken();
+      token = getNextToken(token);
      if (token != null) {
-        shingleBuf.add(token);
+        shingleBuf.add(token.clone());
 
        if (shingleBuf.size() > maxShingleSize)
        {
          shingleBuf.remove(0);
@@ -235,7 +236,7 @@
    }
 
    int i = 0;
-    Token token = null;
+    Token shingle = null;
    for (Iterator it = shingleBuf.iterator(); it.hasNext(); ) {
      token = (Token) it.next();
      for (int j = i; j < shingles.length; j++) {
@@ -258,17 +259,26 @@
    /*
     * Push new tokens to the output buffer.
     */
+    if (!shingleBuf.isEmpty()) {
+      Token firstShingle = (Token) shingleBuf.get(0);
+      shingle = (Token) firstShingle.clone();
+      shingle.setType(tokenType);
+    }
    for (int j = 1; j < shingleBuf.size(); j++) {
-      Token shingle = new Token(shingles[j].toString(),
-                                ((Token) shingleBuf.get(0)).startOffset(),
-                                endOffsets[j],
-                                tokenType);
+      shingle.setEndOffset(endOffsets[j]);
+      StringBuffer buf = shingles[j];
+      int termLength = buf.length();
+      char[] termBuffer = shingle.termBuffer();
+      if (termBuffer.length < termLength)
+        termBuffer = shingle.resizeTermBuffer(termLength);
+      buf.getChars(0, termLength, termBuffer, 0);
+      shingle.setTermLength(termLength);
      if ((! outputUnigrams) && j == 1) {
        shingle.setPositionIncrement(1);
      } else {
        shingle.setPositionIncrement(0);
      }
-      outputBuf.add(shingle);
+      outputBuf.add(shingle.clone());
    }
  }
 }
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java	(revision 683629)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java	(working copy)
@@ -35,25 +35,19 @@
         this.charset = charset;
     }
 
-    public final Token next() throws java.io.IOException
+    public final Token next(Token token) throws java.io.IOException
     {
-        Token t = input.next();
+        token = input.next(token);
 
-        if (t == null)
+        if (token == null)
            return null;
 
-        String txt = t.termText();
-
-        char[] chArray = txt.toCharArray();
-        for (int i = 0; i < chArray.length; i++)
+        char[] chArray = token.termBuffer();
+        int chLen = token.termLength();
+        for (int i = 0; i < chLen; i++)
        {
            chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset);
        }
-
-        String newTxt = new String(chArray);
-        // create new token
-        Token newToken = new Token(newTxt, t.startOffset(), t.endOffset());
-
-        return newToken;
+        return token;
    }
 }
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java	(revision 683629)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java	(working copy)
@@ -18,8 +18,11 @@
  */
 
 import java.util.Hashtable;
-import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
 /**
  * Title: ChineseFilter
  * Description: Filter with a stop word table
@@ -61,35 +64,35 @@
             stopTable.put(STOP_WORDS[i], STOP_WORDS[i]);
     }
 
-    public final Token next() throws java.io.IOException {
+    public final Token next(Token token) throws java.io.IOException {
 
-        for (Token token = input.next(); token != null; token = input.next()) {
-            String text = token.termText();
+        for (token = input.next(token); token != null; token = input.next(token)) {
+            String text = new String(token.termBuffer(), 0, token.termLength());
 
-          // why not key off token type here assuming ChineseTokenizer comes first?
-            if (stopTable.get(text) == null) {
-                switch (Character.getType(text.charAt(0))) {
+            // why not key off token type here assuming ChineseTokenizer comes first?
+            if (stopTable.get(text) == null) {
+                switch (Character.getType(text.charAt(0))) {
 
-                case Character.LOWERCASE_LETTER:
-                case Character.UPPERCASE_LETTER:
+                case Character.LOWERCASE_LETTER:
+                case Character.UPPERCASE_LETTER:
 
-                    // English word/token should larger than 1 character.
-                    if (text.length()>1) {
-                        return token;
-                    }
-                    break;
-                case Character.OTHER_LETTER:
+                    // English word/token should be larger than 1 character.
+                    if (text.length()>1) {
+                        return token;
+                    }
+                    break;
+                case Character.OTHER_LETTER:
 
-                    // One Chinese character as one Chinese word.
-                    // Chinese word extraction to be added later here.
-                    return token;
-                }
+                    // One Chinese character as one Chinese word.
+                    // Chinese word extraction to be added later here.
+                    return token;
+                }
-            }
+            }
-        }
-        return null;
+        }
+        return null;
    }
 }
\ No newline at end of file
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java	(revision 683629)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java	(working copy)
@@ -22,8 +22,6 @@
 import org.apache.lucene.analysis.TokenStream;
 
 import java.io.IOException;
-import java.util.HashSet;
-import java.util.Hashtable;
 import java.util.Set;
 
 /**
@@ -33,10 +31,6 @@
  */
 public final class BrazilianStemFilter extends TokenFilter {
 
-  /**
-   * The actual token in the input stream.
-   */
-  private Token token = null;
   private BrazilianStemmer stemmer = null;
   private Set exclusions = null;
 
@@ -53,22 +47,28 @@
   /**
    * @return Returns the next token in the stream, or null at EOS.
    */
-  public final Token next()
-      throws IOException {
-    if ((token = input.next()) == null) {
+  public final Token next(Token token) throws IOException {
+    token = input.next(token);
+
+    if (token == null)
      return null;
-    }
-    // Check the exclusiontable.
-    else if (exclusions != null && exclusions.contains(token.termText())) {
-      return token;
-    } else {
-      String s = stemmer.stem(token.termText());
-      // If not stemmed, dont waste the time creating a new token.
-      if ((s != null) && !s.equals(token.termText())) {
-        return new Token(s, token.startOffset(), token.endOffset(), token.type());
+
+    String term = new String(token.termBuffer(), 0, token.termLength());
+
+    // Check the exclusion table.
+    if (exclusions == null || !exclusions.contains(term)) {
+      String s = stemmer.stem(term);
+      // If not stemmed, don't waste the time adjusting the token.
+      if ((s != null) && !s.equals(term)) {
+        int termLength = s.length();
+        char[] termBuffer = token.termBuffer();
+        if (termBuffer.length < termLength)
+          termBuffer = token.resizeTermBuffer(termLength);
+        s.getChars(0, termLength, termBuffer, 0);
+        token.setTermLength(termLength);
      }
-      return token;
    }
+    return token;
  }
 }
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java	(revision 683629)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java	(working copy)
@@ -115,15 +115,14 @@
   }
 
   /** Returns the next token in the stream, or null at EOS */
-  public final Token next() throws IOException {
+  public final Token next(Token token) throws IOException {
     if (ngrams.size() > 0) {
       return (Token) ngrams.removeFirst();
     }
 
-    Token token = input.next();
-    if (token == null) {
+    token = input.next(token);
+    if (token == null)
      return null;
-    }
 
     ngram(token);
     if (ngrams.size() > 0)
@@ -133,12 +132,12 @@
   }
 
   private void ngram(Token token) {
-    String inStr = token.termText();
-    int inLen = inStr.length();
+    int termLength = token.termLength();
+    char[] termBuffer = token.termBuffer();
     int gramSize = minGram;
     while (gramSize <= maxGram) {
       // if the remaining input is too short, we can't generate any n-grams
-      if (gramSize > inLen) {
+      if (gramSize > termLength) {
        return;
      }
 
@@ -147,13 +146,13 @@
        return;
      }
 
-      Token tok;
-      if (side == Side.FRONT) {
-        tok = new Token(inStr.substring(0, gramSize), 0, gramSize);
-      }
-      else {
-        tok = new Token(inStr.substring(inLen-gramSize), inLen-gramSize, inLen);
-      }
+      // grab gramSize chars from front or back
+      int start = side == Side.FRONT ? 0 : termLength - gramSize;
+      int end = start + gramSize;
+      Token tok = (Token) token.clone();
+      tok.setStartOffset(start);
+      tok.setEndOffset(end);
+      tok.setTermBuffer(termBuffer, start, gramSize);
      ngrams.add(tok);
      gramSize++;
    }
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java	(revision 683629)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java	(working copy)
@@ -63,12 +63,12 @@
   }
 
   /** Returns the next token in the stream, or null at EOS. */
-  public final Token next() throws IOException {
+  public final Token next(Token token) throws IOException {
     if (ngrams.size() > 0) {
       return (Token) ngrams.removeFirst();
     }
 
-    Token token = input.next();
+    token = input.next(token);
     if (token == null) {
       return null;
     }
@@ -81,15 +81,16 @@
   }
 
   private void ngram(Token token) {
-    String inStr = token.termText();
-    int inLen = inStr.length();
+    char[] termBuffer = token.termBuffer();
+    int termLength = token.termLength();
     int gramSize = minGram;
     while (gramSize <= maxGram) {
       int pos = 0;                        // reset to beginning of string
-      while (pos+gramSize <= inLen) {     // while there is input
-        String gram = inStr.substring(pos, pos+gramSize);
-        Token tok = new Token(gram, pos, pos+gramSize);
-//        tok.setPositionIncrement(pos);
+      while (pos+gramSize <= termLength) {     // while there is input
+        Token tok = (Token) token.clone();
+        tok.setStartOffset(pos);
+        tok.setEndOffset(pos+gramSize);
+        tok.setTermBuffer(termBuffer, pos, gramSize);
        ngrams.add(tok);
        pos++;
      }
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java	(revision 683629)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java	(working copy)
@@ -34,10 +34,6 @@
  */
 public final class FrenchStemFilter extends TokenFilter {
 
-  /**
-   * The actual token in the input stream.
-   */
-  private Token token = null;
   private FrenchStemFilter stemmer = null;
   private Set exclusions = null;
 
@@ -55,25 +51,31 @@
   /**
    * @return  Returns the next token in the stream, or null at EOS
    */
-  public final Token next()
-    throws IOException {
-    if ( ( token = input.next() ) == null ) {
-      return null;
-    }
-    // Check the exclusiontable
-    else if ( exclusions != null && exclusions.contains( token.termText() ) ) {
-      return token;
-    }
-    else {
-      String s = stemmer.stem( token.termText() );
-      // If not stemmed, dont waste the time creating a new token
-      if ( !s.equals( token.termText() ) ) {
-        return new Token( s, token.startOffset(), token.endOffset(), token.type());
-      }
-      return token;
-    }
-  }
-  /**
+  public final Token next(Token token) throws IOException {
+    token = input.next(token);
+
+    if (token == null)
+      return null;
+
+    String term = new String(token.termBuffer(), 0, token.termLength());
+
+    // Check the exclusion table.
+    if (exclusions == null || !exclusions.contains(term)) {
+      String s = stemmer.stem(term);
+      // If not stemmed, don't waste the time adjusting the token.
+      if ((s != null) && !s.equals(term)) {
+        int termLength = s.length();
+        char[] termBuffer = token.termBuffer();
+        if (termBuffer.length < termLength)
+          termBuffer = token.resizeTermBuffer(termLength);
+        s.getChars(0, termLength, termBuffer, 0);
+        token.setTermLength(termLength);
+      }
+    }
+    return token;
+  }
+
+  /**
   * Set a alternative/custom FrenchStemmer for this filter.
   */
  public void setStemmer( FrenchStemmer stemmer ) {
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java	(revision 683629)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java	(working copy)
@@ -38,7 +38,7 @@
 public class ElisionFilter extends TokenFilter {
   private Set articles = null;
 
-  private static String apostrophes = "'’";
+  private static char[] apostrophes = {'\'', '’'};
 
   public void setArticles(Set articles) {
     this.articles = new HashSet();
@@ -74,25 +74,35 @@
   }
 
   /**
-   * Returns the next input Token whith termText() without elisioned start
+   * Returns the next input Token with term() without elisioned start
   */
-  public Token next() throws IOException {
-    Token t = input.next();
-    if (t == null)
+  public Token next(Token token) throws IOException {
+    token = input.next(token);
+    if (token == null)
      return null;
-    String text = t.termText();
-    System.out.println(text);
-    int minPoz = -1;
-    int poz;
-    for (int i = 0; i < apostrophes.length(); i++) {
-      poz = text.indexOf(apostrophes.charAt(i));
-      if (poz != -1)
-        minPoz = (minPoz == -1) ? poz : Math.min(poz, minPoz);
+
+    char[] termBuffer = token.termBuffer();
+    int termLength = token.termLength();
+
+    int minPoz = Integer.MAX_VALUE;
+    for (int i = 0; i < apostrophes.length; i++) {
+      char apos = apostrophes[i];
+      // The equivalent of String.indexOf(ch)
+      for (int poz = 0; poz < termLength; poz++) {
+        if (termBuffer[poz] == apos) {
+          minPoz = Math.min(poz, minPoz);
+          break;
+        }
+      }
    }
-    if (minPoz != -1
-        && articles.contains(text.substring(0, minPoz).toLowerCase()))
-      text = text.substring(minPoz + 1);
-    return new Token(text, t.startOffset(), t.endOffset(), t.type());
+
+    // An apostrophe has been found. If the prefix is an article, strip it off.
+    if (minPoz != Integer.MAX_VALUE
+        && articles.contains(new String(token.termBuffer(), 0, minPoz).toLowerCase())) {
+      token.setTermBuffer(token.termBuffer(), minPoz + 1, token.termLength() - (minPoz + 1));
+    }
+
+    return token;
  }
 }
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java	(revision 683629)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java	(working copy)
@@ -35,10 +35,7 @@
  * @author Edwin de Jonge
  */
 public final class DutchStemFilter extends TokenFilter {
-  /**
-   * The actual token in the input stream.
-   */
-  private Token token = null;
+
   private DutchStemmer stemmer = null;
   private Set exclusions = null;
 
@@ -66,23 +63,28 @@
   /**
    * @return Returns the next token in the stream, or null at EOS
   */
-  public Token next() throws IOException {
-    if ((token = input.next()) == null) {
+  public final Token next(Token token) throws IOException {
+    token = input.next(token);
+
+    if (token == null)
      return null;
-    }
-    // Check the exclusiontable
-    else if (exclusions != null && exclusions.contains(token.termText())) {
-      return token;
-    } else {
-      String s = stemmer.stem(token.termText());
-      // If not stemmed, dont waste the time creating a new token
-      if (!s.equals(token.termText())) {
-        return new Token(s, token.startOffset(),
-          token.endOffset(), token.type());
+
+    String term = new String(token.termBuffer(), 0, token.termLength());
+
+    // Check the exclusion table.
+    if (exclusions == null || !exclusions.contains(term)) {
+      String s = stemmer.stem(term);
+      // If not stemmed, don't waste the time adjusting the token.
+      if ((s != null) && !s.equals(term)) {
+        int termLength = s.length();
+        char[] termBuffer = token.termBuffer();
+        if (termBuffer.length < termLength)
+          termBuffer = token.resizeTermBuffer(termLength);
+        s.getChars(0, termLength, termBuffer, 0);
+        token.setTermLength(termLength);
      }
-      return token;
    }
+    return token;
  }
 
  /**
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java	(revision 683629)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java	(working copy)
@@ -40,31 +40,50 @@
     breaker = BreakIterator.getWordInstance(new Locale("th"));
   }
 
-  public Token next() throws IOException {
+  public Token next(Token token) throws IOException {
     if (thaiToken != null) {
-      String text = thaiToken.termText();
       int start = breaker.current();
       int end = breaker.next();
       if (end != BreakIterator.DONE) {
-        return new Token(text.substring(start, end),
-            thaiToken.startOffset()+start, thaiToken.startOffset()+end, thaiToken.type());
+        token.setTermBuffer(thaiToken.termBuffer(), start, end - start);
+        token.setStartOffset(thaiToken.startOffset()+start);
+        token.setEndOffset(thaiToken.startOffset()+end);
+        token.setType(thaiToken.type());
+        token.setPayload(thaiToken.getPayload());
+        token.setFlags(thaiToken.getFlags());
+        return token;
      }
      thaiToken = null;
    }
-    Token tk = input.next();
-    if (tk == null) {
+
+    token = input.next(token);
+    if (token == null || token.termLength() == 0) {
      return null;
    }
-    String text = tk.termText();
+
+    String text = new String(token.termBuffer(), 0, token.termLength());
    if (UnicodeBlock.of(text.charAt(0)) != UnicodeBlock.THAI) {
-      return new Token(text.toLowerCase(), tk.startOffset(), tk.endOffset(), tk.type());
+      String lowerText = text.toLowerCase();
+      int termLength = lowerText.length();
+      char[] termBuffer = token.termBuffer();
+      if (termBuffer.length < termLength)
+        termBuffer = token.resizeTermBuffer(termLength);
+      lowerText.getChars(0, termLength, termBuffer, 0);
+      token.setTermLength(termLength);
+      return token;
    }
-    thaiToken = tk;
+
+    thaiToken = (Token) token.clone();
    breaker.setText(text);
    int end = breaker.next();
    if (end != BreakIterator.DONE) {
-      return new Token(text.substring(0, end),
-        thaiToken.startOffset(), thaiToken.startOffset()+end, thaiToken.type());
+      char[] termBuffer = token.termBuffer();
+      if (termBuffer.length < end)
+        termBuffer = token.resizeTermBuffer(end);
+      text.getChars(0, end, termBuffer, 0);
+      token.setTermLength(end);
+      token.setEndOffset(token.startOffset() + end);
+      return token;
    }
    return null;
  }