Index: lucene/contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java
===================================================================
--- lucene/contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java	(revision 682416)
+++ lucene/contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java	(working copy)
@@ -33,12 +33,10 @@
     {
         String s = "a天b";
         ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s));
-        Token token;
         int correctStartOffset = 0;
         int correctEndOffset = 1;
-        while ((token = tokenizer.next()) != null)
-        {
+        for (Token token = tokenizer.next(new Token()); token != null; token = tokenizer.next(token)) {
             assertEquals(correctStartOffset, token.startOffset());
             assertEquals(correctEndOffset, token.endOffset());
             correctStartOffset++;
Index: lucene/contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java
===================================================================
--- lucene/contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java	(revision 682416)
+++ lucene/contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java	(working copy)
@@ -42,12 +42,13 @@
 	 */
 	private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
 		TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+		Token token = new Token();
 		for (int i=0; i<output.length; i++) {
-			Token t = ts.next();
-			assertNotNull(t);
-			assertEquals(t.termText(), output[i]);
+			token = ts.next(token);
+			assertNotNull(token);
+			assertEquals(token.term(), output[i]);
 		}
-		assertNull(ts.next());
+		assertNull(ts.next(token));
 		ts.close();
 	}
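Note: the pattern used throughout this patch follows the Lucene 2.4 TokenStream contract, where next(Token) may recycle the instance it is handed, so a consumer must treat the returned token as valid only until the following call. A minimal consumer sketch (WhitespaceTokenizer and the input text are just illustrative stand-ins, not part of the patch):

    import java.io.StringReader;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;

    public class ReuseLoopDemo {
        public static void main(String[] args) throws Exception {
            TokenStream stream = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
            for (Token t = stream.next(new Token()); t != null; t = stream.next(t)) {
                // The stream may recycle t on the next call, so read (or clone) it now.
                System.out.println(t.term() + " [" + t.startOffset() + "," + t.endOffset() + ")");
            }
            stream.close();
        }
    }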
Index: lucene/contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java
===================================================================
--- lucene/contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java	(revision 682416)
+++ lucene/contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java	(working copy)
     TokenStream ts;
     TokenListStream tls;
     LinkedList tokens;
 
-    // test a plain old token stream with synonyms tranlated to rows.
+    // test a plain old token stream with synonyms translated to rows.
     tokens = new LinkedList();
-    tokens.add(new Token("please", 0, 6));
-    tokens.add(new Token("divide", 7, 13));
-    tokens.add(new Token("this", 14, 18));
-    tokens.add(new Token("sentence", 19, 27));
-    tokens.add(new Token("into", 28, 32));
-    tokens.add(new Token("shingles", 33, 39));
+    tokens.add(createToken("please", 0, 6));
+    tokens.add(createToken("divide", 7, 13));
+    tokens.add(createToken("this", 14, 18));
+    tokens.add(createToken("sentence", 19, 27));
+    tokens.add(createToken("into", 28, 32));
+    tokens.add(createToken("shingles", 33, 39));
 
     tls = new TokenListStream(tokens);
 
@@ -70,21 +64,23 @@
 
     ts = new ShingleMatrixFilter(tls, 1, 2, ' ', false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec());
 
-    assertNext(ts, "please", 0, 6);
-    assertNext(ts, "please divide", 0, 13);
-    assertNext(ts, "divide", 7, 13);
-    assertNext(ts, "divide this", 7, 18);
-    assertNext(ts, "this", 14, 18);
-    assertNext(ts, "this sentence", 14, 27);
-    assertNext(ts, "sentence", 19, 27);
-    assertNext(ts, "sentence into", 19, 32);
-    assertNext(ts, "into", 28, 32);
-    assertNext(ts, "into shingles", 28, 39);
-    assertNext(ts, "shingles", 33, 39);
+    Token token = new Token();
+    assertNext(ts, token, "please", 0, 6);
+    assertNext(ts, token, "please divide", 0, 13);
+    assertNext(ts, token, "divide", 7, 13);
+    assertNext(ts, token, "divide this", 7, 18);
+    assertNext(ts, token, "this", 14, 18);
+    assertNext(ts, token, "this sentence", 14, 27);
+    assertNext(ts, token, "sentence", 19, 27);
+    assertNext(ts, token, "sentence into", 19, 32);
+    assertNext(ts, token, "into", 28, 32);
+    assertNext(ts, token, "into shingles", 28, 39);
+    assertNext(ts, token, "shingles", 33, 39);
 
-    assertNull(ts.next());
+    assertNull(ts.next(token));
+
   }
 
   /**
@@ -95,9 +91,6 @@
 
     ShingleMatrixFilter.defaultSettingsCodec = null;//new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();
 
-    Token token = new Token(); // for debug use only
-
-
     TokenStream ts;
     TokenListStream tls;
     LinkedList tokens;
@@ -117,25 +110,27 @@
 
     ts = new ShingleMatrixFilter(tls, 2, 2, '_', false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
 
-    assertNext(ts, "hello_world");
-    assertNext(ts, "greetings_world");
-    assertNext(ts, "hello_earth");
-    assertNext(ts, "greetings_earth");
-    assertNext(ts, "hello_tellus");
-    assertNext(ts, "greetings_tellus");
-    assertNull(ts.next());
+    Token token = new Token();
+    assertNext(ts, token, "hello_world");
+    assertNext(ts, token, "greetings_world");
+    assertNext(ts, token, "hello_earth");
+    assertNext(ts, token, "greetings_earth");
+    assertNext(ts, token, "hello_tellus");
+    assertNext(ts, token, "greetings_tellus");
+    assertNull(ts.next(token));
 
     // bi-grams with no spacer character, start offset, end offset
 
     tls.reset();
     ts = new ShingleMatrixFilter(tls, 2, 2, null, false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
 
-    assertNext(ts, "helloworld", 0, 10);
-    assertNext(ts, "greetingsworld", 0, 10);
-    assertNext(ts, "helloearth", 0, 10);
-    assertNext(ts, "greetingsearth", 0, 10);
-    assertNext(ts, "hellotellus", 0, 10);
-    assertNext(ts, "greetingstellus", 0, 10);
-    assertNull(ts.next());
+    token = new Token();
+    assertNext(ts, token, "helloworld", 0, 10);
+    assertNext(ts, token, "greetingsworld", 0, 10);
+    assertNext(ts, token, "helloearth", 0, 10);
+    assertNext(ts, token, "greetingsearth", 0, 10);
+    assertNext(ts, token, "hellotellus", 0, 10);
+    assertNext(ts, token, "greetingstellus", 0, 10);
+    assertNull(ts.next(token));
 
     // add ^_prefix_and_suffix_$
 
@@ -160,119 +155,122 @@
 
     ts = new ShingleMatrixFilter(tls, 2, 2, '_', false);
 //
-//    while ((token = ts.next(token)) != null) {
-//      System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
+//    for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+//      System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
 //      token.clear();
 //    }
 
-    assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
-    assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
-    assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
-    assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
-    assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
-    assertNull(ts.next());
+    token = new Token();
+    assertNext(ts, token, "^_hello", 1, 10.049875f, 0, 4);
+    assertNext(ts, token, "^_greetings", 1, 10.049875f, 0, 4);
+    assertNext(ts, token, "hello_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "greetings_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "hello_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "greetings_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "hello_tellus", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "greetings_tellus", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "world_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, token, "earth_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, token, "tellus_$", 1, 7.1414285f, 5, 10);
+    assertNull(ts.next(token));
 
     // test unlimited size and allow single boundary token as shingle
     tls.reset();
     ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, '_', false);
 //
-//    while ((token = ts.next(token)) != null) {
-//      System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
+//    for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+//      System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
 //      token.clear();
 //    }
 
-    assertNext(ts, "^", 1, 10.0f, 0, 0);
-    assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
-    assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "hello", 1, 1.0f, 0, 4);
-    assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
-    assertNext(ts, "world", 1, 1.0f, 5, 10);
-    assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
-    assertNext(ts, "$", 1, 7.071068f, 10, 10);
-    assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
-    assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "greetings", 1, 1.0f, 0, 4);
-    assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
-    assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
-    assertNext(ts, "earth", 1, 1.0f, 5, 10);
-    assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
-    assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
-    assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
-    assertNext(ts, "tellus", 1, 1.0f, 5, 10);
-    assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
-    assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
+    token = new Token();
+    assertNext(ts, token, "^", 1, 10.0f, 0, 0);
+    assertNext(ts, token, "^_hello", 1, 10.049875f, 0, 4);
+    assertNext(ts, token, "^_hello_world", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_hello_world_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "hello", 1, 1.0f, 0, 4);
+    assertNext(ts, token, "hello_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "hello_world_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, token, "world", 1, 1.0f, 5, 10);
+    assertNext(ts, token, "world_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, token, "$", 1, 7.071068f, 10, 10);
+    assertNext(ts, token, "^_greetings", 1, 10.049875f, 0, 4);
+    assertNext(ts, token, "^_greetings_world", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_greetings_world_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "greetings", 1, 1.0f, 0, 4);
+    assertNext(ts, token, "greetings_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "greetings_world_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, token, "^_hello_earth", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_hello_earth_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "hello_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "hello_earth_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, token, "earth", 1, 1.0f, 5, 10);
+    assertNext(ts, token, "earth_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, token, "^_greetings_earth", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "greetings_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "greetings_earth_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, token, "^_hello_tellus", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "hello_tellus", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "hello_tellus_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, token, "tellus", 1, 1.0f, 5, 10);
+    assertNext(ts, token, "tellus_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, token, "^_greetings_tellus", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "greetings_tellus", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
 
-    assertNull(ts.next());
+    assertNull(ts.next(token));
 
     // test unlimited size but don't allow single boundary token as shingle
 
     tls.reset();
     ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, '_', true);
 
-//    while ((token = ts.next(token)) != null) {
-//      System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
+//    for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+//      System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
 //      token.clear();
 //    }
 
-    assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
-    assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "hello", 1, 1.0f, 0, 4);
-    assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
-    assertNext(ts, "world", 1, 1.0f, 5, 10);
-    assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
-    assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
-    assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "greetings", 1, 1.0f, 0, 4);
-    assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
-    assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
-    assertNext(ts, "earth", 1, 1.0f, 5, 10);
-    assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
-    assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
-    assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
-    assertNext(ts, "tellus", 1, 1.0f, 5, 10);
-    assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
-    assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
-    assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
-    assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
+    token = new Token();
+    assertNext(ts, token, "^_hello", 1, 10.049875f, 0, 4);
+    assertNext(ts, token, "^_hello_world", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_hello_world_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "hello", 1, 1.0f, 0, 4);
+    assertNext(ts, token, "hello_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "hello_world_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, token, "world", 1, 1.0f, 5, 10);
+    assertNext(ts, token, "world_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, token, "^_greetings", 1, 10.049875f, 0, 4);
+    assertNext(ts, token, "^_greetings_world", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_greetings_world_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "greetings", 1, 1.0f, 0, 4);
+    assertNext(ts, token, "greetings_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "greetings_world_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, token, "^_hello_earth", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_hello_earth_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "hello_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "hello_earth_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, token, "earth", 1, 1.0f, 5, 10);
+    assertNext(ts, token, "earth_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, token, "^_greetings_earth", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "greetings_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "greetings_earth_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, token, "^_hello_tellus", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "hello_tellus", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "hello_tellus_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, token, "tellus", 1, 1.0f, 5, 10);
+    assertNext(ts, token, "tellus_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, token, "^_greetings_tellus", 1, 10.099504f, 0, 10);
+    assertNext(ts, token, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, token, "greetings_tellus", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
 
-    assertNull(ts.next());
+    assertNull(ts.next(token));
 
     System.currentTimeMillis();
 
@@ -300,27 +298,28 @@
 
     ts = new ShingleMatrixFilter(tls, 2, 3, '_', false);
 
-//    while ((token = ts.next(token)) != null) {
-//      System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
+//    for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+//      System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
 //      token.clear();
 //    }
 
     // shingle, position increment, weight, start offset, end offset
 
-    assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "greetings_and", 1, 1.4142135f, 0, 4);
-    assertNext(ts, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
-    assertNext(ts, "and_salutations", 1, 1.4142135f, 0, 4);
-    assertNext(ts, "and_salutations_world", 1, 1.7320508f, 0, 10);
-    assertNext(ts, "salutations_world", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "and_salutations_earth", 1, 1.7320508f, 0, 10);
-    assertNext(ts, "salutations_earth", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
-    assertNext(ts, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
-    assertNext(ts, "salutations_tellus", 1, 1.4142135f, 0, 10);
+    token = new Token();
+    assertNext(ts, token, "hello_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "greetings_and", 1, 1.4142135f, 0, 4);
+    assertNext(ts, token, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
+    assertNext(ts, token, "and_salutations", 1, 1.4142135f, 0, 4);
+    assertNext(ts, token, "and_salutations_world", 1, 1.7320508f, 0, 10);
+    assertNext(ts, token, "salutations_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "hello_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "and_salutations_earth", 1, 1.7320508f, 0, 10);
+    assertNext(ts, token, "salutations_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "hello_tellus", 1, 1.4142135f, 0, 10);
+    assertNext(ts, token, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
+    assertNext(ts, token, "salutations_tellus", 1, 1.4142135f, 0, 10);
 
-    assertNull(ts.next());
+    assertNull(ts.next(token));
 
     System.currentTimeMillis();
 
@@ -361,53 +360,53 @@
 
     TokenStream ts = new ShingleMatrixFilter(matrix, 2, 4, '_', true, new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec());
 
-//    Token token = new Token();
-//    while ((token = ts.next(token)) != null) {
-//      System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
+//    for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+//      System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
 //      token.clear();
 //    }
 
-    assertNext(ts, "no_surprise", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "no_surprise_to", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "no_surprise_to_see", 1, 2.0f, 0, 0);
-    assertNext(ts, "surprise_to", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "surprise_to_see", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "surprise_to_see_england", 1, 2.0f, 0, 0);
-    assertNext(ts, "to_see", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "to_see_england", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "to_see_england_manager", 1, 2.0f, 0, 0);
-    assertNext(ts, "see_england", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "see_england_manager", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "see_england_manager_svennis", 1, 2.0f, 0, 0);
-    assertNext(ts, "england_manager", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "england_manager_svennis", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "england_manager_svennis_in", 1, 2.0f, 0, 0);
-    assertNext(ts, "manager_svennis", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "manager_svennis_in", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "manager_svennis_in_the", 1, 2.0f, 0, 0);
-    assertNext(ts, "svennis_in", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "svennis_in_the", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "svennis_in_the_croud", 1, 2.0f, 0, 0);
-    assertNext(ts, "in_the", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "in_the_croud", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "the_croud", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "see_england_manager_sven", 1, 2.0f, 0, 0);
-    assertNext(ts, "england_manager_sven", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "england_manager_sven_göran", 1, 2.0f, 0, 0);
-    assertNext(ts, "manager_sven", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "manager_sven_göran", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0);
-    assertNext(ts, "sven_göran", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "sven_göran_eriksson", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "sven_göran_eriksson_in", 1, 2.0f, 0, 0);
-    assertNext(ts, "göran_eriksson", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "göran_eriksson_in", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "göran_eriksson_in_the", 1, 2.0f, 0, 0);
-    assertNext(ts, "eriksson_in", 1, 1.4142135f, 0, 0);
-    assertNext(ts, "eriksson_in_the", 1, 1.7320508f, 0, 0);
-    assertNext(ts, "eriksson_in_the_croud", 1, 2.0f, 0, 0);
+    Token token = new Token();
+    assertNext(ts, token, "no_surprise", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "no_surprise_to", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "no_surprise_to_see", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "surprise_to", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "surprise_to_see", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "surprise_to_see_england", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "to_see", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "to_see_england", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "to_see_england_manager", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "see_england", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "see_england_manager", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "see_england_manager_svennis", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "england_manager", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "england_manager_svennis", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "england_manager_svennis_in", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "manager_svennis", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "manager_svennis_in", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "manager_svennis_in_the", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "svennis_in", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "svennis_in_the", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "svennis_in_the_croud", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "in_the", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "in_the_croud", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "the_croud", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "see_england_manager_sven", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "england_manager_sven", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "england_manager_sven_göran", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "manager_sven", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "manager_sven_göran", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "sven_göran", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "sven_göran_eriksson", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "sven_göran_eriksson_in", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "göran_eriksson", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "göran_eriksson_in", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "göran_eriksson_in_the", 1, 2.0f, 0, 0);
+    assertNext(ts, token, "eriksson_in", 1, 1.4142135f, 0, 0);
+    assertNext(ts, token, "eriksson_in_the", 1, 1.7320508f, 0, 0);
+    assertNext(ts, token, "eriksson_in_the_croud", 1, 2.0f, 0, 0);
 
-    assertNull(ts.next());
+    assertNull(ts.next(token));
 
   }
 
@@ -417,11 +416,9 @@
 
   private Token tokenFactory(String text, int posIncr, int startOffset, int endOffset) {
-    Token token = new Token();
-    token.setTermText(text);
+    Token token = new Token(startOffset, endOffset);
+    token.setTermBuffer(text);
     token.setPositionIncrement(posIncr);
-    token.setStartOffset(startOffset);
-    token.setEndOffset(endOffset);
     return token;
   }
 
@@ -435,48 +432,44 @@
   }
 
   private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset) {
-    Token token = new Token();
-    token.setTermText(text);
+    Token token = new Token(startOffset, endOffset);
+    token.setTermBuffer(text);
     token.setPositionIncrement(posIncr);
     ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight);
-    token.setStartOffset(startOffset);
-    token.setEndOffset(endOffset);
     return token;
   }
 
   private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset, ShingleMatrixFilter.TokenPositioner positioner) {
-    Token token = new Token();
-    token.setTermText(text);
+    Token token = new Token(startOffset, endOffset);
+    token.setTermBuffer(text);
     token.setPositionIncrement(posIncr);
     ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight);
-    token.setStartOffset(startOffset);
-    token.setEndOffset(endOffset);
     ShingleMatrixFilter.defaultSettingsCodec.setTokenPositioner(token, positioner);
     return token;
   }
 
   // assert-methods start here
 
-  private Token assertNext(TokenStream ts, String text) throws IOException {
-    Token token = ts.next(new Token());
+  private Token assertNext(TokenStream ts, Token token, String text) throws IOException {
+    token = ts.next(token);
     assertNotNull(token);
-    assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
+    assertEquals(text, token.term());
    return token;
   }
 
-  private Token assertNext(TokenStream ts, String text, int positionIncrement, float boost) throws IOException {
-    Token token = ts.next(new Token());
+  private Token assertNext(TokenStream ts, Token token, String text, int positionIncrement, float boost) throws IOException {
+    token = ts.next(token);
     assertNotNull(token);
-    assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
+    assertEquals(text, token.term());
     assertEquals(positionIncrement, token.getPositionIncrement());
     assertEquals(boost, token.getPayload() == null ? 1f : PayloadHelper.decodeFloat(token.getPayload().getData()));
     return token;
   }
 
-  private Token assertNext(TokenStream ts, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException {
-    Token token = ts.next(new Token());
+  private Token assertNext(TokenStream ts, Token token, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException {
+    token = ts.next(token);
     assertNotNull(token);
-    assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
+    assertEquals(text, token.term());
     assertEquals(positionIncrement, token.getPositionIncrement());
     assertEquals(boost, token.getPayload() == null ? 1f : PayloadHelper.decodeFloat(token.getPayload().getData()));
     assertEquals(startOffset, token.startOffset());
@@ -484,25 +477,31 @@
     return token;
   }
 
-  private Token assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
-    Token token = ts.next(new Token());
+  private Token assertNext(TokenStream ts, Token token, String text, int startOffset, int endOffset) throws IOException {
+    token = ts.next(token);
     assertNotNull(token);
-    assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
+    assertEquals(text, token.term());
     assertEquals(startOffset, token.startOffset());
     assertEquals(endOffset, token.endOffset());
     return token;
   }
 
+  private static Token createToken(String term, int start, int offset)
+  {
+    Token token = new Token(start, offset);
+    token.setTermBuffer(term);
+    return token;
+  }
+
   public static class TokenListStream extends TokenStream {
 
     private Collection tokens;
 
     public TokenListStream(TokenStream ts) throws IOException {
       tokens = new ArrayList();
-      Token token;
-      while ((token = ts.next(new Token())) != null) {
-        tokens.add(token);
+      for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+        tokens.add((Token) token.clone());
       }
     }
 
@@ -512,14 +511,15 @@
 
     private Iterator iterator;
 
-    public Token next() throws IOException {
+    public Token next(Token token) throws IOException {
       if (iterator == null) {
        iterator = tokens.iterator();
      }
      if (!iterator.hasNext()) {
        return null;
      }
-      return iterator.next();
+      token = (Token) iterator.next();
+      return (Token) token.clone();
    }
Index: lucene/contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
===================================================================
--- lucene/contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java	(revision 682416)
+++ lucene/contrib/analyzers/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java	(working copy)
@@ -36,13 +36,13 @@
 		throws Exception {
 
 	TokenStream ts = a.tokenStream("dummy", new StringReader(input));
-
+	Token token = new Token();
 	for (int i = 0; i < output.length; i++) {
-		Token t = ts.next();
-		assertNotNull(t);
-		assertEquals(t.termText(), output[i]);
+		token = ts.next(token);
+		assertNotNull(token);
+		assertEquals(token.term(), output[i]);
 	}
-	assertNull(ts.next());
+	assertNull(ts.next(token));
 	ts.close();
 }
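Note: the TokenListStream fixture above caches a stream by storing clone()s and replays clones back out; handing out the stored instances would let one consumer's reuse corrupt the cache. A minimal sketch of that caching idiom outside the test (the class name is illustrative):

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.Iterator;
    import java.util.List;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;

    public class CachingTokenStream extends TokenStream {
        private final List cache = new ArrayList();
        private Iterator iterator;

        public CachingTokenStream(TokenStream in) throws IOException {
            for (Token t = in.next(new Token()); t != null; t = in.next(t)) {
                cache.add(t.clone());  // snapshot: the source stream will recycle t
            }
        }

        public Token next(Token token) throws IOException {
            if (iterator == null) {
                iterator = cache.iterator();
            }
            if (!iterator.hasNext()) {
                return null;
            }
            // Hand out a copy so callers can never mutate the cached original.
            return (Token) ((Token) iterator.next()).clone();
        }

        public void reset() throws IOException {
            iterator = null;
        }
    }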
Index: lucene/contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java
===================================================================
--- lucene/contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java	(revision 682416)
+++ lucene/contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java	(working copy)
@@ -43,10 +43,9 @@
     String test = "The quick red fox jumped over the lazy brown dogs";
 
     NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))), 3, "D");
-    Token tok = new Token();
     boolean seenDogs = false;
-    while ((tok = nptf.next(tok)) != null){
-      if (tok.termText().equals("dogs")){
+    for (Token tok = nptf.next(new Token()); tok != null; tok = nptf.next(tok)) {
+      if (tok.term().equals("dogs")){
         seenDogs = true;
         assertTrue(tok.type() + " is not equal to " + "D", tok.type().equals("D") == true);
         assertTrue("tok.getPayload() is null and it shouldn't be", tok.getPayload() != null);
@@ -69,7 +68,7 @@
 
     public Token next(Token result) throws IOException {
       result = input.next(result);
-      if (result != null && result.termText().equals("dogs")) {
+      if (result != null && result.term().equals("dogs")) {
         result.setType("D");
       }
       return result;
Index: lucene/contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterTest.java
===================================================================
--- lucene/contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterTest.java	(revision 682416)
+++ lucene/contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterTest.java	(working copy)
@@ -44,9 +44,8 @@
     String test = "The quick red fox jumped over the lazy brown dogs";
 
     TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))));
-    Token tok = new Token();
     int count = 0;
-    while ((tok = nptf.next(tok)) != null){
+    for (Token tok = nptf.next(new Token()); tok != null; tok = nptf.next(tok)) {
       assertTrue(tok.type() + " is not null and it should be", tok.type().equals(String.valueOf(Character.toUpperCase(tok.termBuffer()[0]))));
       assertTrue("tok.getPayload() is null and it shouldn't be", tok.getPayload() != null);
       String type = new String(tok.getPayload().getData(), "UTF-8");
Index: lucene/contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterTest.java
===================================================================
--- lucene/contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterTest.java	(revision 682416)
+++ lucene/contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterTest.java	(working copy)
@@ -42,9 +42,8 @@
     String test = "The quick red fox jumped over the lazy brown dogs";
 
     TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(new WhitespaceTokenizer(new StringReader(test)));
-    Token tok = new Token();
     int count = 0;
-    while ((tok = nptf.next(tok)) != null){
+    for (Token tok = nptf.next(new Token()); tok != null; tok = nptf.next(tok)) {
      assertTrue("tok is null and it shouldn't be", tok != null);
      Payload pay = tok.getPayload();
      assertTrue("pay is null and it shouldn't be", pay != null);
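Note: the payload tests above decode raw byte[] payloads with PayloadHelper. A small sketch of the round trip (the value is arbitrary):

    import org.apache.lucene.analysis.payloads.PayloadHelper;
    import org.apache.lucene.index.Payload;

    public class PayloadRoundTrip {
        public static void main(String[] args) {
            // encodeFloat packs a float into 4 bytes; decodeFloat reverses it.
            byte[] bytes = PayloadHelper.encodeFloat(1.4142135f);
            Payload payload = new Payload(bytes);
            float weight = PayloadHelper.decodeFloat(payload.getData());
            System.out.println(weight);  // 1.4142135
        }
    }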
Index: lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
===================================================================
--- lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java	(revision 682416)
+++ lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java	(working copy)
@@ -105,12 +105,12 @@
     return dict;
   }
 
-  public Token next() throws IOException {
+  public Token next(Token token) throws IOException {
     if (tokens.size() > 0) {
       return (Token)tokens.removeFirst();
     }
 
-    Token token = input.next();
+    token = input.next(token);
     if (token == null) {
       return null;
     }
@@ -155,7 +155,7 @@
   protected void decompose(final Token token) {
     // In any case we give the original token back
-    tokens.add(token);
+    tokens.add((Token) token.clone());
 
     // Only words longer than minWordSize get processed
     if (token.termLength() < this.minWordSize) {
Index: lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
===================================================================
--- lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java	(revision 682416)
+++ lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java	(working copy)
@@ -37,25 +37,19 @@
         this.charset = charset;
     }
 
-    public final Token next() throws java.io.IOException
+    public final Token next(Token token) throws java.io.IOException
    {
-        Token t = input.next();
+        token = input.next(token);
 
-        if (t == null)
+        if (token == null)
             return null;
 
-        String txt = t.termText();
-
-        char[] chArray = txt.toCharArray();
-        for (int i = 0; i < chArray.length; i++)
+        char[] chArray = token.termBuffer();
+        int chLen = token.termLength();
+        for (int i = 0; i < chLen; i++)
        {
            chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
        }
-
-        String newTxt = new String(chArray);
-        // create new token
-        Token newToken = new Token(newTxt, t.startOffset(), t.endOffset());
-
-        return newToken;
+        return token;
    }
 }
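Note: RussianLowerCaseFilter now rewrites the term in place instead of allocating a String and a fresh Token per call (GreekLowerCaseFilter below gets the same treatment). The same idea as a stand-alone filter, a minimal sketch against the 2.4 API; the ASCII-only lowercasing is purely illustrative:

    import java.io.IOException;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;

    public final class AsciiLowerCaseFilter extends TokenFilter {
        public AsciiLowerCaseFilter(TokenStream in) {
            super(in);
        }

        public Token next(Token token) throws IOException {
            token = input.next(token);
            if (token == null) {
                return null;
            }
            // Mutate the shared char[] directly; no new Token, no new String.
            char[] buffer = token.termBuffer();
            int length = token.termLength();
            for (int i = 0; i < length; i++) {
                char c = buffer[i];
                if (c >= 'A' && c <= 'Z') {
                    buffer[i] = (char) (c + ('a' - 'A'));
                }
            }
            return token;
        }
    }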
Index: lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
===================================================================
--- lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java	(revision 682416)
+++ lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java	(working copy)
@@ -35,7 +35,6 @@
     /**
      * The actual token in the input stream.
      */
-    private Token token = null;
     private RussianStemmer stemmer = null;
 
     public RussianStemFilter(TokenStream in, char[] charset)
@@ -47,22 +46,20 @@
     /**
      * @return  Returns the next token in the stream, or null at EOS
      */
-    public final Token next() throws IOException
+    public final Token next(Token token) throws IOException
    {
-        if ((token = input.next()) == null)
+        if ((token = input.next(token)) == null)
        {
            return null;
        }
-        else
+        String term = token.term();
+        String s = stemmer.stem(term);
+        if (!s.equals(term))
        {
-            String s = stemmer.stem(token.termText());
-            if (!s.equals(token.termText()))
-            {
-                return new Token(s, token.startOffset(), token.endOffset(),
-                    token.type());
-            }
-            return token;
+            token.clear();
+            token.setTermBuffer(s);
        }
+        return token;
    }
 
     /**
Index: lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizer.java
===================================================================
--- lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizer.java	(revision 682416)
+++ lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizer.java	(working copy)
@@ -48,7 +48,7 @@
   public void add(Token t) {
     //check to see if this is a Category
     if (t != null && typeToMatch.equals(t.type())){
-      lst.add(t.clone());
+      super.add(t);
     }
   }
 }
Index: lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizer.java
===================================================================
--- lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizer.java	(revision 682416)
+++ lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizer.java	(working copy)
@@ -73,10 +73,10 @@
     //Check to see if this token is a date
     if (t != null) {
       try {
-        Date date = dateFormat.parse(new String(t.termBuffer(), 0, t.termLength()));//We don't care about the date, just that we can parse it as a date
+        Date date = dateFormat.parse(t.term());//We don't care about the date, just that we can parse it as a date
         if (date != null) {
           t.setType(DATE_TYPE);
-          lst.add(t.clone());
+          super.add(t);
         }
       } catch (ParseException e) {
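Note: both sink tokenizers now delegate to SinkTokenizer.add(Token), which is responsible for the defensive copy the removed t.clone() calls used to make. A minimal sketch of how a sink is typically wired up with TeeTokenFilter, assuming the 2.4 core SinkTokenizer/TeeTokenFilter API (constructor shapes from memory):

    import java.io.StringReader;
    import org.apache.lucene.analysis.SinkTokenizer;
    import org.apache.lucene.analysis.TeeTokenFilter;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;

    public class SinkDemo {
        public static void main(String[] args) throws Exception {
            SinkTokenizer sink = new SinkTokenizer();
            // TeeTokenFilter passes every token through and offers it to the sink's add().
            TokenStream stream = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader("a b c")), sink);
            for (Token t = stream.next(new Token()); t != null; t = stream.next(t)) {
                // drain the main stream; the sink fills up as a side effect
            }
            // The sink can now be replayed, e.g. into a second field.
            for (Token t = sink.next(new Token()); t != null; t = sink.next(t)) {
                System.out.println(t.term());
            }
        }
    }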
Index: lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
===================================================================
--- lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java	(revision 682416)
+++ lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java	(working copy)
@@ -37,7 +37,6 @@
     /**
      * The actual token in the input stream.
      */
-    private Token token = null;
     private GermanStemmer stemmer = null;
     private Set exclusionSet = null;
 
@@ -59,22 +58,23 @@
     /**
      * @return  Returns the next token in the stream, or null at EOS
      */
-    public final Token next()
+    public final Token next(Token token)
       throws IOException
    {
-        if ( ( token = input.next() ) == null ) {
+        if ( ( token = input.next(token) ) == null ) {
            return null;
        }
-        // Check the exclusiontable
-        else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) {
+        String term = token.term();
+        // Check the exclusion table
+        if ( exclusionSet != null && exclusionSet.contains( term ) ) {
            return token;
        }
        else {
-            String s = stemmer.stem( token.termText() );
-            // If not stemmed, dont waste the time creating a new token
-            if ( !s.equals( token.termText() ) ) {
-                return new Token( s, token.startOffset(),
-                    token.endOffset(), token.type() );
+            String s = stemmer.stem( term );
+            // If not stemmed, don't waste the time creating a new token
+            if ( !s.equals( term ) ) {
+                token.clear();
+                token.setTermBuffer(s);
            }
            return token;
        }
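Note: GermanStemFilter, and the Russian, Brazilian and French equivalents in this patch, all converge on the same shape: read the term once, consult the exclusion set, and overwrite the reused token's buffer only when the stem differs. A condensed sketch of that shape; the Stemmer interface here is hypothetical, each real filter brings its own stemmer class:

    import java.io.IOException;
    import java.util.Set;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;

    public final class ReusingStemFilter extends TokenFilter {
        public interface Stemmer { String stem(String term); }  // hypothetical

        private final Stemmer stemmer;
        private final Set exclusions;

        public ReusingStemFilter(TokenStream in, Stemmer stemmer, Set exclusions) {
            super(in);
            this.stemmer = stemmer;
            this.exclusions = exclusions;
        }

        public Token next(Token token) throws IOException {
            if ((token = input.next(token)) == null) {
                return null;
            }
            String term = token.term();      // read the term once
            if (exclusions != null && exclusions.contains(term)) {
                return token;                // protected words pass through untouched
            }
            String stem = stemmer.stem(term);
            if (stem != null && !stem.equals(term)) {
                token.setTermBuffer(stem);   // overwrite in place; offsets and type survive
            }
            return token;
        }
    }

One design note: this sketch skips Token.clear(), which keeps any payload and position increment on the token; the patch's filters call clear() before setTermBuffer(), which also resets those two fields.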
Index: lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
===================================================================
--- lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java	(revision 682416)
+++ lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java	(working copy)
@@ -150,11 +150,11 @@
   }
 
   /* (non-Javadoc)
-   * @see org.apache.lucene.analysis.TokenStream#next()
-   */
-  public Token next() throws IOException {
+   * @see org.apache.lucene.analysis.TokenStream#next()
+   */
+  public Token next(Token token) throws IOException {
     if (outputBuf.isEmpty()) {
-      fillOutputBuf();
+      fillOutputBuf(token);
     }
     Token nextToken = null;
     if ( ! outputBuf.isEmpty())
@@ -173,16 +173,18 @@
    * @return the next token, or null if at end of input stream
    * @throws IOException if the input stream has a problem
    */
-  private Token getNextToken() throws IOException {
+  private Token getNextToken(Token token) throws IOException {
     if (tokenBuf.isEmpty()) {
-      Token lastToken = input.next();
+      Token lastToken = token;
+      lastToken = input.next(lastToken);
       if (lastToken != null) {
+        Token fillerToken = new Token(lastToken.startOffset(), lastToken.startOffset());
+        fillerToken.setTermBuffer(FILLER_TOKEN);
         for (int i = 1; i < lastToken.getPositionIncrement(); i++) {
-          tokenBuf.add(new Token(FILLER_TOKEN, lastToken.startOffset(),
-                                 lastToken.startOffset()));
+          tokenBuf.add(fillerToken.clone());
         }
-        tokenBuf.add(lastToken);
-        return getNextToken();
+        tokenBuf.add(lastToken.clone());
+        return getNextToken(lastToken);
       } else {
        return null;
      }
@@ -196,15 +198,15 @@
    *
    * @throws IOException if there's a problem getting the next token
    */
-  private void fillOutputBuf() throws IOException {
+  private void fillOutputBuf(Token token) throws IOException {
     boolean addedToken = false;
     /*
      * Try to fill the shingle buffer.
      */
     do {
-      Token token = getNextToken();
+      token = getNextToken(token);
 
       if (token != null) {
-        shingleBuf.add(token);
+        shingleBuf.add(token.clone());
 
         if (shingleBuf.size() > maxShingleSize)
        {
          shingleBuf.remove(0);
@@ -235,17 +237,17 @@
     }
 
     int i = 0;
-    Token token = null;
+    Token shingle = null;
     for (Iterator it = shingleBuf.iterator(); it.hasNext(); ) {
-      token = (Token) it.next();
+      shingle = (Token) it.next();
       for (int j = i; j < shingles.length; j++) {
         if (shingles[j].length() != 0) {
           shingles[j].append(TOKEN_SEPARATOR);
         }
-        shingles[j].append(token.termBuffer(), 0, token.termLength());
+        shingles[j].append(shingle.termBuffer(), 0, shingle.termLength());
       }
-      endOffsets[i] = token.endOffset();
+      endOffsets[i] = shingle.endOffset();
       i++;
     }
 
@@ -258,17 +260,19 @@
     /*
      * Push new tokens to the output buffer.
      */
+    shingle = new Token();
+    shingle.setType(tokenType);
     for (int j = 1; j < shingleBuf.size(); j++) {
-      Token shingle = new Token(shingles[j].toString(),
-                                ((Token) shingleBuf.get(0)).startOffset(),
-                                endOffsets[j],
-                                tokenType);
+      shingle.clear();
+      shingle.setStartOffset(((Token) shingleBuf.get(0)).startOffset());
+      shingle.setEndOffset(endOffsets[j]);
+      shingle.setTermBuffer(shingles[j].toString());
       if ((! outputUnigrams) && j == 1) {
         shingle.setPositionIncrement(1);
       } else {
         shingle.setPositionIncrement(0);
       }
-      outputBuf.add(shingle);
+      outputBuf.add(shingle.clone());
     }
   }
 }
Index: lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java
===================================================================
--- lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java	(revision 682416)
+++ lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java	(working copy)
@@ -17,16 +17,23 @@
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.NoSuchElementException;
+import java.util.Set;
+
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
 import org.apache.lucene.analysis.payloads.PayloadHelper;
 import org.apache.lucene.index.Payload;
 
-import java.io.IOException;
-import java.util.*;
-
 /**
  * A ShingleFilter constructs shingles (token n-grams) from a token stream.
  * In other words, it creates combinations of tokens as a single token.
@@ -340,14 +347,14 @@
     }
 
     // shingle token factory
-    StringBuilder sb = new StringBuilder(termLength + 10); // paranormal abillity to forsay the future.
+    StringBuilder sb = new StringBuilder(termLength + 10); // paranormal ability to foresee the future.
     for (Token shingleToken : shingle) {
       if (spacerCharacter != null && sb.length() > 0) {
         sb.append(spacerCharacter);
       }
       sb.append(shingleToken.termBuffer(), 0, shingleToken.termLength());
     }
-    token.setTermText(sb.toString());
+    token.setTermBuffer(sb.toString());
 
     updateToken(token, shingle, currentPermutationTokensStartOffset, currentPermutationRows, currentPermuationTokens);
 
     return token;
Index: lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
===================================================================
--- lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java	(revision 682416)
+++ lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java	(working copy)
@@ -35,25 +35,19 @@
         this.charset = charset;
     }
 
-    public final Token next() throws java.io.IOException
+    public final Token next(Token token) throws java.io.IOException
    {
-        Token t = input.next();
+        token = input.next(token);
 
-        if (t == null)
+        if (token == null)
             return null;
 
-        String txt = t.termText();
-
-        char[] chArray = txt.toCharArray();
-        for (int i = 0; i < chArray.length; i++)
+        char[] chArray = token.termBuffer();
+        int chLen = token.termLength();
+        for (int i = 0; i < chLen; i++)
        {
            chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset);
        }
-
-        String newTxt = new String(chArray);
-        // create new token
-        Token newToken = new Token(newTxt, t.startOffset(), t.endOffset());
-
-        return newToken;
+        return token;
    }
 }
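Note: ChineseFilter, next, shows the reuse idiom for a filter that drops tokens: keep pulling into the same Token until one survives. A minimal stop-style sketch under the same 2.4 contract (the class and the stop set are illustrative):

    import java.io.IOException;
    import java.util.HashSet;
    import java.util.Set;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;

    public final class SimpleStopFilter extends TokenFilter {
        private final Set stopWords = new HashSet();

        public SimpleStopFilter(TokenStream in, String[] words) {
            super(in);
            for (int i = 0; i < words.length; i++) {
                stopWords.add(words[i]);
            }
        }

        public Token next(Token token) throws IOException {
            // Reuse the same instance across skipped tokens; only survivors escape.
            for (token = input.next(token); token != null; token = input.next(token)) {
                if (!stopWords.contains(token.term())) {
                    return token;
                }
            }
            return null;
        }
    }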
Index: lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java
===================================================================
--- lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java	(revision 682416)
+++ lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java	(working copy)
@@ -18,8 +18,11 @@
  */
 
 import java.util.Hashtable;
-import org.apache.lucene.analysis.*;
 
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
 /**
  * Title: ChineseFilter
  * Description: Filter with a stop word table
@@ -61,10 +64,10 @@
             stopTable.put(STOP_WORDS[i], STOP_WORDS[i]);
     }
 
-    public final Token next() throws java.io.IOException {
+    public final Token next(Token token) throws java.io.IOException {
 
-        for (Token token = input.next(); token != null; token = input.next()) {
-            String text = token.termText();
+        for (token = input.next(token); token != null; token = input.next(token)) {
+            String text = token.term();
 
             // why not key off token type here assuming ChineseTokenizer comes first?
             if (stopTable.get(text) == null) {
Index: lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
===================================================================
--- lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java	(revision 682416)
+++ lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java	(working copy)
@@ -19,9 +19,11 @@
 
 import java.io.Reader;
-import org.apache.lucene.analysis.*;
 
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+
 /**
  * Title: ChineseTokenizer
  * Description: Extract tokens from the Stream using Character.getType()
@@ -75,17 +77,21 @@
     }
 
-    private final Token flush() {
+    private final Token flush(Token token) {
 
         if (length>0) {
             //System.out.println(new String(buffer, 0, length));
-            return new Token(new String(buffer, 0, length), start, start+length);
+            token.clear();
+            token.setTermBuffer(buffer, 0, length);
+            token.setStartOffset(start);
+            token.setEndOffset(start+length);
+            return token;
         }
         else
             return null;
     }
 
-    public final Token next() throws java.io.IOException {
+    public final Token next(Token token) throws java.io.IOException {
 
         length = 0;
         start = offset;
@@ -101,7 +105,7 @@
                 bufferIndex = 0;
             }
 
-            if (dataLen == -1) return flush();
+            if (dataLen == -1) return flush(token);
             else
                 c = ioBuffer[bufferIndex++];
 
@@ -112,20 +116,20 @@
             case Character.LOWERCASE_LETTER:
             case Character.UPPERCASE_LETTER:
                 push(c);
-                if (length == MAX_WORD_LEN) return flush();
+                if (length == MAX_WORD_LEN) return flush(token);
                 break;
 
             case Character.OTHER_LETTER:
                 if (length>0) {
                     bufferIndex--;
                     offset--;
-                    return flush();
+                    return flush(token);
                 }
                 push(c);
-                return flush();
+                return flush(token);
 
             default:
-                if (length>0) return flush();
+                if (length>0) return flush(token);
                 break;
             }
         }
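Note: one wrinkle a reused-token tokenizer has to respect, assuming the 2.4 Token semantics: clear() resets the term, payload, flags and position increment but deliberately leaves offsets and type alone, so flush(Token) above must set both offsets explicitly or the token would carry stale offsets from its previous use. A tiny hypothetical helper making the required refresh explicit:

    import org.apache.lucene.analysis.Token;

    final class TokenRefill {
        // Hypothetical helper: everything a recycled token needs refreshed.
        static Token refill(Token token, char[] buffer, int length, int start) {
            token.clear();                           // term/payload/flags/posIncr reset
            token.setTermBuffer(buffer, 0, length);  // new term text
            token.setStartOffset(start);             // offsets are NOT cleared by clear()
            token.setEndOffset(start + length);
            return token;
        }
    }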
Index: lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java
===================================================================
--- lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java	(revision 682416)
+++ lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java	(working copy)
@@ -28,11 +28,12 @@
 public class SingleTokenTokenStream extends TokenStream {
 
   private boolean exhausted = false;
+
+  // The token needs to be immutable, so work with clones!
   private Token token;
 
   public SingleTokenTokenStream(Token token) {
-    this.token = token;
+    this.token = (Token) token.clone();
   }
 
@@ -41,7 +42,7 @@
       return null;
     }
     exhausted = true;
-    return token;
+    return (Token) token.clone();
   }
 
@@ -50,10 +51,10 @@
   }
 
   public Token getToken() {
-    return token;
+    return (Token) token.clone();
   }
 
   public void setToken(Token token) {
-    this.token = token;
+    this.token = (Token) token.clone();
   }
 }
Index: lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java
===================================================================
--- lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java	(revision 682416)
+++ lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java	(working copy)
@@ -124,7 +124,6 @@
     if (source.termBuffer() != null) {
       setTermBuffer(source.termBuffer(), 0, source.termLength());
     } else {
-      setTermText(null);
       setTermLength(0);
     }
Index: lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java
===================================================================
--- lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java	(revision 682416)
+++ lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java	(working copy)
@@ -27,18 +27,7 @@
  */
 public class EmptyTokenStream extends TokenStream {
 
-  public Token next() throws IOException {
-    return null;
-  }
-
   public Token next(Token result) throws IOException {
     return null;
   }
-
-  public void reset() throws IOException {
-  }
-
-  public void close() throws IOException {
-  }
-
 }
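Note: SingleTokenTokenStream now clones at every boundary crossing (constructor, next, getter, setter), the simplest way to guarantee the stored token can never be mutated behind the stream's back. Usage is unchanged; each caller gets a private copy. A sketch, assuming the 2.4 base-class bridging between next() and next(Token):

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;

    public class SingleTokenDemo {
        public static void main(String[] args) throws Exception {
            Token proto = new Token(0, 5);
            proto.setTermBuffer("hello");
            SingleTokenTokenStream stream = new SingleTokenTokenStream(proto);

            proto.setTermBuffer("mutated");  // harmless: the stream holds its own clone
            Token t = stream.next(new Token());
            System.out.println(t.term());    // prints "hello"
        }
    }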
+      if ((s != null) && !s.equals(term)) {
+        token.clear();
+        token.setTermBuffer(s);
       }
       return token;
     }
Index: lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
===================================================================
--- lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java	(revision 682416)
+++ lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java	(working copy)
@@ -64,7 +64,7 @@
   }
 
   /** Returns the next token in the stream, or null at EOS. */
-  public final Token next() throws IOException {
+  public final Token next(Token token) throws IOException {
     if (!started) {
       started = true;
       gramSize = minGram;
@@ -82,9 +82,14 @@
       if (pos+gramSize > inLen)
         return null;
     }
-    String gram = inStr.substring(pos, pos+gramSize);
+
     int oldPos = pos;
     pos++;
-    return new Token(gram, oldPos, oldPos+gramSize);
+    token.clear();
+    token.setTermBuffer(inStr, oldPos, gramSize);
+    token.setStartOffset(oldPos);
+    token.setEndOffset(oldPos+gramSize);
+    token.setType(Token.DEFAULT_TYPE);
+    return token;
   }
 }
Index: lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
===================================================================
--- lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java	(revision 682416)
+++ lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java	(working copy)
@@ -115,12 +115,12 @@
   }
 
   /** Returns the next token in the stream, or null at EOS. */
-  public final Token next() throws IOException {
+  public final Token next(Token token) throws IOException {
     if (ngrams.size() > 0) {
       return (Token) ngrams.removeFirst();
     }
 
-    Token token = input.next();
+    token = input.next(token);
     if (token == null) {
       return null;
     }
@@ -133,8 +133,8 @@
   }
 
   private void ngram(Token token) {
-    String inStr = token.termText();
-    int inLen = inStr.length();
+    char[] inStr = token.termBuffer();
+    int inLen = token.termLength();
     int gramSize = minGram;
     while (gramSize <= maxGram) {
       // if the remaining input is too short, we can't generate any n-grams
@@ -147,13 +147,11 @@
         return;
       }
 
-      Token tok;
-      if (side == Side.FRONT) {
-        tok = new Token(inStr.substring(0, gramSize), 0, gramSize);
-      }
-      else {
-        tok = new Token(inStr.substring(inLen-gramSize), inLen-gramSize, inLen);
-      }
+      // grab gramSize chars from front or back
+      int start = side == Side.FRONT ? 0 : inLen - gramSize;
+      int end = start + gramSize;
+      Token tok = new Token(start, end);
+      tok.setTermBuffer(inStr, start, gramSize);
       ngrams.add(tok);
       gramSize++;
     }
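Note: the n-gram classes now slice grams straight out of the shared term buffer with setTermBuffer(char[], int, int) instead of going through substring(). The one trap with a reused token is that every field read must use state saved before it is advanced; in NGramTokenizer above the gram has to be taken at oldPos, captured before pos is incremented. A condensed view of the slicing (values are illustrative):

    import org.apache.lucene.analysis.Token;

    public class GramSliceDemo {
        public static void main(String[] args) {
            char[] term = {'s', 'h', 'i', 'n', 'g', 'l', 'e'};
            int start = 2, gramSize = 3;                  // the gram "ing"
            Token gram = new Token(start, start + gramSize);
            gram.setTermBuffer(term, start, gramSize);    // no substring(), no extra String
            System.out.println(gram.term());              // prints "ing"
        }
    }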
*/ - public final Token next() throws IOException { + public final Token next(Token token) throws IOException { // if we are just starting, read the whole input if (!started) { started = true; char[] chars = new char[1024]; input.read(chars); - inStr = new String(chars).trim(); // remove any trailing empty strings + inStr = new String(chars).trim(); // remove any leading or trailing spaces inLen = inStr.length(); gramSize = minGram; } @@ -134,15 +135,15 @@ return null; } - Token tok; - if (side == Side.FRONT) { - tok = new Token(inStr.substring(0, gramSize), 0, gramSize); - } - else { - tok = new Token(inStr.substring(inLen-gramSize), inLen-gramSize, inLen); - } - + // grab gramSize chars from front or back + int start = side == Side.FRONT ? 0 : inLen - gramSize; + int end = start + gramSize; + token.clear(); + token.setTermBuffer(inStr, start, gramSize); + token.setStartOffset(start); + token.setEndOffset(end); + token.setType(Token.DEFAULT_TYPE); gramSize++; - return tok; + return token; } } Index: lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java =================================================================== --- lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (revision 682416) +++ lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (working copy) @@ -63,12 +63,12 @@ } /** Returns the next token in the stream, or null at EOS. */ - public final Token next() throws IOException { + public final Token next(Token token) throws IOException { if (ngrams.size() > 0) { return (Token) ngrams.removeFirst(); } - Token token = input.next(); + token = input.next(token); if (token == null) { return null; } @@ -81,14 +81,14 @@ } private void ngram(Token token) { - String inStr = token.termText(); - int inLen = inStr.length(); + char[] inStr = token.termBuffer(); + int inLen = token.termLength(); int gramSize = minGram; while (gramSize <= maxGram) { int pos = 0; // reset to beginning of string while (pos+gramSize <= inLen) { // while there is input - String gram = inStr.substring(pos, pos+gramSize); - Token tok = new Token(gram, pos, pos+gramSize); + Token tok = new Token(pos, pos+gramSize); + tok.setTermBuffer(inStr, pos, gramSize); // tok.setPositionIncrement(pos); ngrams.add(tok); pos++; Index: lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java =================================================================== --- lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java (revision 682416) +++ lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java (working copy) @@ -37,7 +37,6 @@ /** * The actual token in the input stream. 
*/ - private Token token = null; private FrenchStemmer stemmer = null; private Set exclusions = null; @@ -55,20 +54,22 @@ /** * @return Returns the next token in the stream, or null at EOS */ - public final Token next() + public final Token next(Token token) throws IOException { - if ( ( token = input.next() ) == null ) { + if ( ( token = input.next(token) ) == null ) { return null; } - // Check the exclusiontable - else if ( exclusions != null && exclusions.contains( token.termText() ) ) { + String term = token.term(); + // Check the exclusion table + if ( exclusions != null && exclusions.contains( term ) ) { return token; } else { - String s = stemmer.stem( token.termText() ); - // If not stemmed, dont waste the time creating a new token - if ( !s.equals( token.termText() ) ) { - return new Token( s, token.startOffset(), token.endOffset(), token.type()); + String s = stemmer.stem( term ); + // If not stemmed, don't waste the time creating a new token + if ( !s.equals( term ) ) { + token.clear(); + token.setTermBuffer(s); } return token; } Index: lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java =================================================================== --- lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (revision 682416) +++ lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (working copy) @@ -74,14 +74,14 @@ } /** - * Returns the next input Token whith termText() without elisioned start + * Returns the next input Token, with any elided article removed from the start of term() */ - public Token next() throws IOException { - Token t = input.next(); - if (t == null) + public Token next(Token token) throws IOException { + token = input.next(token); + if (token == null) return null; - String text = t.termText(); - System.out.println(text); + String text = token.term(); + //System.out.println(text); int minPoz = -1; int poz; for (int i = 0; i < apostrophes.length(); i++) { @@ -92,7 +92,10 @@ if (minPoz != -1 && articles.contains(text.substring(0, minPoz).toLowerCase())) text = text.substring(minPoz + 1); - return new Token(text, t.startOffset(), t.endOffset(), t.type()); + + token.clear(); + token.setTermBuffer(text); + return token; } } Index: lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java =================================================================== --- lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (revision 682416) +++ lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (working copy) @@ -26,7 +26,7 @@ /** * CJKTokenizer was modified from StopTokenizer which does a decent job for * most European languages. It performs other token methods for double-byte - * Characters: the token will return at each two charactors with overlap match.
+ * Characters: a token is returned for every two adjacent characters, so consecutive tokens overlap by one character.
* Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it * also need filter filter zero length token ""
* for Digit: digit, '+', '#' will token as letter
@@ -96,13 +96,14 @@ * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html * for detail. * + * @param token a reusable token * @return Token * * @throws java.io.IOException - throw IOException when read error
- * hanppened in the InputStream + * happened in the InputStream * */ - public final Token next() throws java.io.IOException { + public final Token next(Token token) throws java.io.IOException { /** how many character(s) has been stored in buffer */ int length = 0; @@ -110,10 +111,10 @@ int start = offset; while (true) { - /** current charactor */ + /** current character */ char c; - /** unicode block of current charactor for detail */ + /** unicode block of current character for detail */ Character.UnicodeBlock ub; offset++; @@ -198,7 +199,7 @@ } } } else { - // non-ASCII letter, eg."C1C2C3C4" + // non-ASCII letter, e.g. "C1C2C3C4" if (Character.isLetter(c)) { if (length == 0) { start = offset - 1; @@ -236,8 +237,11 @@ } } - return new Token(new String(buffer, 0, length), start, start + length, - tokenType - ); + token.clear(); + token.setTermBuffer(buffer, 0, length); + token.setStartOffset(start); + token.setEndOffset(start + length); + token.setType(tokenType); + return token; } } Index: lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java =================================================================== --- lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java (revision 682416) +++ lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java (working copy) @@ -38,7 +38,6 @@ /** * The actual token in the input stream. */ - private Token token = null; private DutchStemmer stemmer = null; private Set exclusions = null; @@ -66,20 +65,20 @@ /** * @return Returns the next token in the stream, or null at EOS */ - public Token next() throws IOException { - if ((token = input.next()) == null) { + public Token next(Token token) throws IOException { + if ((token = input.next(token)) == null) { return null; } - - // Check the exclusiontable - else if (exclusions != null && exclusions.contains(token.termText())) { + String term = token.term(); + // Check the exclusion table + if (exclusions != null && exclusions.contains(term)) { return token; } else { - String s = stemmer.stem(token.termText()); - // If not stemmed, dont waste the time creating a new token - if (!s.equals(token.termText())) { - return new Token(s, token.startOffset(), - token.endOffset(), token.type()); + String s = stemmer.stem(term); + // If not stemmed, don't waste the time creating a new token + if (!s.equals(term)) { + token.clear(); + token.setTermBuffer(s); } return token; } Index: lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java =================================================================== --- lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (revision 682416) +++ lucene/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (working copy) @@ -40,31 +40,41 @@ breaker = BreakIterator.getWordInstance(new Locale("th")); } - public Token next() throws IOException { + public Token next(Token token) throws IOException { if (thaiToken != null) { - String text = thaiToken.termText(); int start = breaker.current(); int end = breaker.next(); if (end != BreakIterator.DONE) { + token.clear(); + token.setTermBuffer(thaiToken.termBuffer(), start, end - start); + token.setStartOffset(thaiToken.startOffset()+start); + token.setEndOffset(thaiToken.startOffset()+end); + token.setType(thaiToken.type()); + return token; - return new Token(text.substring(start, end), - thaiToken.startOffset()+start, thaiToken.startOffset()+end, thaiToken.type()); } thaiToken =
null; } - Token tk = input.next(); - if (tk == null) { + + token = input.next(token); + if (token == null || token.termLength() == 0) { return null; } - String text = tk.termText(); + + String text = token.term(); if (UnicodeBlock.of(text.charAt(0)) != UnicodeBlock.THAI) { - return new Token(text.toLowerCase(), tk.startOffset(), tk.endOffset(), tk.type()); + token.clear(); + token.setTermBuffer(text.toLowerCase()); + return token; } - thaiToken = tk; + + thaiToken = (Token) token.clone(); breaker.setText(text); int end = breaker.next(); if (end != BreakIterator.DONE) { - return new Token(text.substring(0, end), - thaiToken.startOffset(), thaiToken.startOffset()+end, thaiToken.type()); + token.clear(); + token.setTermBuffer(text, 0, end); + token.setEndOffset(token.startOffset() + end); + return token; } return null; }
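
All of the filter conversions above follow the same reusable-token idiom: implement next(Token) instead of next(), pass the caller's Token down the chain, and mutate it in place rather than allocating a new Token per term. A minimal sketch of a filter written against that contract follows; the class name and the reversing transform are illustrative only, not part of this patch.

    import java.io.IOException;

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;

    // Sketch only: a filter that rewrites each term, following the same
    // reuse pattern as the Brazilian/French/Dutch stem filters above.
    public final class TermReversingFilter extends TokenFilter {

      public TermReversingFilter(TokenStream input) {
        super(input);
      }

      public final Token next(Token token) throws IOException {
        // Let the upstream stream fill (and possibly replace) the reusable token.
        token = input.next(token);
        if (token == null) {
          return null;
        }
        String term = token.term();
        String reversed = new StringBuffer(term).reverse().toString();
        // If the transform is a no-op, return the token untouched, exactly as
        // the stem filters above skip terms the stemmer leaves unchanged.
        if (!reversed.equals(term)) {
          // clear() resets the term, payload and position increment, but leaves
          // startOffset, endOffset and type alone, so the rewritten token still
          // points at the original span of input text.
          token.clear();
          token.setTermBuffer(reversed);
        }
        return token;
      }
    }

That asymmetry in Token.clear() is what the conversions rely on: none of the stem filters restore offsets or type after clearing, and they don't need to.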
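The converted tokenizers (NGramTokenizer, EdgeNGramTokenizer, CJKTokenizer) use the complementary idiom: a tokenizer is the source of offsets and type, so after clear() it must set all four fields explicitly. A hypothetical whitespace tokenizer, again only a sketch and not part of this patch:

    import java.io.IOException;
    import java.io.Reader;

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.Tokenizer;

    // Sketch only: splits on whitespace and fills the reusable token the same
    // way the n-gram and CJK tokenizers above do.
    public final class SimpleWhitespaceTokenizer extends Tokenizer {

      private int offset = 0; // number of chars read from the input so far

      public SimpleWhitespaceTokenizer(Reader input) {
        super(input);
      }

      public final Token next(Token token) throws IOException {
        int c;
        // Skip leading whitespace.
        do {
          c = input.read();
          offset++;
        } while (c != -1 && Character.isWhitespace((char) c));
        if (c == -1) {
          return null;
        }
        int start = offset - 1; // offset of the first non-whitespace char
        StringBuffer term = new StringBuffer();
        while (c != -1 && !Character.isWhitespace((char) c)) {
          term.append((char) c);
          c = input.read();
          offset++;
        }
        // The same four steps as the converted tokenizers: reset the reusable
        // token, then set term text, offsets and type explicitly.
        token.clear();
        token.setTermBuffer(term.toString());
        token.setStartOffset(start);
        token.setEndOffset(start + term.length());
        token.setType(Token.DEFAULT_TYPE);
        return token;
      }
    }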