Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java (revision 776655) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java (working copy) @@ -22,6 +22,7 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * A TokenFilter that applies {@link ArabicNormalizer} to normalize the orthography. @@ -31,14 +32,26 @@ public class ArabicNormalizationFilter extends TokenFilter { protected ArabicNormalizer normalizer = null; - + private TermAttribute termAtt; + public ArabicNormalizationFilter(TokenStream input) { super(input); normalizer = new ArabicNormalizer(); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + int oldlen = termAtt.termLength(); + int newlen = normalizer.normalize(termAtt.termBuffer(), oldlen); + if (oldlen != newlen) + termAtt.setTermLength(newlen); + return true; + } else { + return false; + } + } - public Token next(Token reusableToken) throws IOException { if ((reusableToken = input.next(reusableToken)) == null) { return null; Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java (revision 776655) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java (working copy) @@ -22,6 +22,7 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import 
org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * A TokenFilter that applies {@link ArabicStemmer} to stem Arabic words.. @@ -31,14 +32,26 @@ public class ArabicStemFilter extends TokenFilter { protected ArabicStemmer stemmer = null; - + private TermAttribute termAtt; + public ArabicStemFilter(TokenStream input) { super(input); stemmer = new ArabicStemmer(); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + int oldlen = termAtt.termLength(); + int newlen = stemmer.stem(termAtt.termBuffer(), oldlen); + if (oldlen != newlen) + termAtt.setTermLength(newlen); + return true; + } else { + return false; + } + } - /** * @return Returns the next token in the stream, or null at EOS */ Index: contrib/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java (revision 776655) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java (working copy) @@ -20,6 +20,7 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import java.io.IOException; import java.util.HashSet; @@ -36,16 +37,34 @@ */ private BrazilianStemmer stemmer = null; private Set exclusions = null; + private TermAttribute termAtt; public BrazilianStemFilter(TokenStream in) { super(in); stemmer = new BrazilianStemmer(); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } public BrazilianStemFilter(TokenStream in, Set exclusiontable) { this(in); this.exclusions = exclusiontable; } + + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String term = termAtt.term(); + // Check the exclusion table. 
+ if (exclusions == null || !exclusions.contains(term)) { + String s = stemmer.stem(term); + // If not stemmed, don't waste the time adjusting the token. + if ((s != null) && !s.equals(term)) + termAtt.setTermBuffer(s); + } + return true; + } else { + return false; + } + } /** * @return Returns the next token in the stream, or null at EOS. Index: contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (revision 776655) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (working copy) @@ -19,7 +19,12 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import java.io.IOException; import java.io.Reader; @@ -76,6 +81,10 @@ * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4" */ private boolean preIsTokened = false; + + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; + private TypeAttribute typeAtt; //~ Constructors ----------------------------------------------------------- @@ -86,10 +95,157 @@ */ public CJKTokenizer(Reader in) { input = in; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); } //~ Methods ---------------------------------------------------------------- + public boolean incrementToken() throws IOException { + /** how many character(s) has been stored in buffer */ + int length = 0; + + /** the position used to create Token */ + int start = offset; + + while (true) { + /** current 
character */ + char c; + + /** unicode block of current character for detail */ + Character.UnicodeBlock ub; + + offset++; + + if (bufferIndex >= dataLen) { + dataLen = input.read(ioBuffer); + bufferIndex = 0; + } + + if (dataLen == -1) { + if (length > 0) { + if (preIsTokened == true) { + length = 0; + preIsTokened = false; + } + + break; + } else { + return false; + } + } else { + //get current character + c = ioBuffer[bufferIndex++]; + + //get the UnicodeBlock of the current character + ub = Character.UnicodeBlock.of(c); + } + + //if the current character is ASCII or Extend ASCII + if ((ub == Character.UnicodeBlock.BASIC_LATIN) + || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) + ) { + if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) { + int i = (int) c; + if (i >= 65281 && i <= 65374) { + /** convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */ + i = i - 65248; + c = (char) i; + } + } + + // if the current character is a letter or "_" "+" "#" + if (Character.isLetterOrDigit(c) + || ((c == '_') || (c == '+') || (c == '#')) + ) { + if (length == 0) { + // "javaC1C2C3C4linux"
+ // ^--: the current character begin to token the ASCII + // letter + start = offset - 1; + } else if (tokenType == "double") { + // "javaC1C2C3C4linux"
+ // ^--: the previous non-ASCII + // : the current character + offset--; + bufferIndex--; + tokenType = "single"; + + if (preIsTokened == true) { + // there is only one non-ASCII has been stored + length = 0; + preIsTokened = false; + + break; + } else { + break; + } + } + + // store the LowerCase(c) in the buffer + buffer[length++] = Character.toLowerCase(c); + tokenType = "single"; + + // break the procedure if buffer overflowed! + if (length == MAX_WORD_LEN) { + break; + } + } else if (length > 0) { + if (preIsTokened == true) { + length = 0; + preIsTokened = false; + } else { + break; + } + } + } else { + // non-ASCII letter, e.g."C1C2C3C4" + if (Character.isLetter(c)) { + if (length == 0) { + start = offset - 1; + buffer[length++] = c; + tokenType = "double"; + } else { + if (tokenType == "single") { + offset--; + bufferIndex--; + + //return the previous ASCII characters + break; + } else { + buffer[length++] = c; + tokenType = "double"; + + if (length == 2) { + offset--; + bufferIndex--; + preIsTokened = true; + + break; + } + } + } + } else if (length > 0) { + if (preIsTokened == true) { + // empty the buffer + length = 0; + preIsTokened = false; + } else { + break; + } + } + } + } + + clearAttributes(); + termAtt.setTermBuffer(buffer, 0, length); + offsetAtt.setOffset(start, start+length); + typeAtt.setType(tokenType); + + return true; + } + /** * Returns the next token in the stream, or null at EOS. 
* See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html Index: contrib/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java (revision 776655) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java (working copy) @@ -20,6 +20,7 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import java.io.IOException; import java.util.Set; @@ -39,11 +40,14 @@ */ private GermanStemmer stemmer = null; private Set exclusionSet = null; + + private TermAttribute termAtt; public GermanStemFilter( TokenStream in ) { super(in); stemmer = new GermanStemmer(); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } /** @@ -54,6 +58,22 @@ this( in ); this.exclusionSet = exclusionSet; } + + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String term = termAtt.term(); + // Check the exclusion table. + if (exclusionSet == null || !exclusionSet.contains(term)) { + String s = stemmer.stem(term); + // If not stemmed, don't waste the time adjusting the token. + if ((s != null) && !s.equals(term)) + termAtt.setTermBuffer(s); + } + return true; + } else { + return false; + } + } /** * @return Returns the next token in the stream, or null at EOS Index: contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java (revision 776655) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java (working copy) @@ -16,9 +16,12 @@ * limitations under the License. 
*/ +import java.io.IOException; + import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Normalizes token text to lower case, analyzing given ("greek") charset. @@ -27,12 +30,28 @@ public final class GreekLowerCaseFilter extends TokenFilter { char[] charset; + private TermAttribute termAtt; public GreekLowerCaseFilter(TokenStream in, char[] charset) { super(in); this.charset = charset; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } + + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + char[] chArray = termAtt.termBuffer(); + int chLen = termAtt.termLength(); + for (int i = 0; i < chLen; i++) + { + chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset); + } + return true; + } else { + return false; + } + } public final Token next(final Token reusableToken) throws java.io.IOException { Index: contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (revision 776655) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (working copy) @@ -25,6 +25,7 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Removes elisions from a token stream. 
For example, "l'avion" (the plane) will be @@ -38,6 +39,8 @@ private Set articles = null; private static char[] apostrophes = {'\'', '’'}; + + private TermAttribute termAtt; public void setArticles(Set articles) { this.articles = new HashSet(); @@ -54,6 +57,7 @@ super(input); this.articles = new HashSet(Arrays.asList(new String[] { "l", "m", "t", "qu", "n", "s", "j" })); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } /** @@ -62,6 +66,7 @@ public ElisionFilter(TokenStream input, Set articles) { super(input); setArticles(articles); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } /** @@ -70,8 +75,38 @@ public ElisionFilter(TokenStream input, String[] articles) { super(input); setArticles(new HashSet(Arrays.asList(articles))); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } + + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + char[] termBuffer = termAtt.termBuffer(); + int termLength = termAtt.termLength(); + int minPoz = Integer.MAX_VALUE; + for (int i = 0; i < apostrophes.length; i++) { + char apos = apostrophes[i]; + // The equivalent of String.indexOf(ch) + for (int poz = 0; poz < termLength ; poz++) { + if (termBuffer[poz] == apos) { + minPoz = Math.min(poz, minPoz); + break; + } + } + } + + // An apostrophe has been found. If the prefix is an article strip it off. 
+ if (minPoz != Integer.MAX_VALUE + && articles.contains(new String(termAtt.termBuffer(), 0, minPoz).toLowerCase())) { + termAtt.setTermBuffer(termAtt.termBuffer(), minPoz + 1, termAtt.termLength() - (minPoz + 1)); + } + + return true; + } else { + return false; + } + } + /** * Returns the next input Token with term() without elisioned start */ Index: contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java (revision 776655) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java (working copy) @@ -20,6 +20,7 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import java.io.IOException; import java.util.HashSet; @@ -39,10 +40,13 @@ */ private FrenchStemmer stemmer = null; private Set exclusions = null; + + private TermAttribute termAtt; public FrenchStemFilter( TokenStream in ) { super(in); stemmer = new FrenchStemmer(); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } @@ -50,7 +54,24 @@ this( in ); exclusions = exclusiontable; } + + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String term = termAtt.term(); + // Check the exclusion table + if ( exclusions == null || !exclusions.contains( term ) ) { + String s = stemmer.stem( term ); + // If not stemmed, don't waste the time adjusting the token. 
+ if ((s != null) && !s.equals( term ) ) + termAtt.setTermBuffer(s); + } + return true; + } else { + return false; + } + } + /** * @return Returns the next token in the stream, or null at EOS */ Index: contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java (revision 776655) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java (working copy) @@ -27,8 +27,13 @@ */ public class EmptyTokenStream extends TokenStream { + public boolean incrementToken() throws IOException { + return false; + } + public Token next(final Token reusableToken) throws IOException { assert reusableToken != null; return null; } + } Index: contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java (revision 776655) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java (working copy) @@ -20,6 +20,7 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import java.io.IOException; import java.util.HashMap; @@ -39,10 +40,12 @@ */ private DutchStemmer stemmer = null; private Set exclusions = null; + private TermAttribute termAtt; public DutchStemFilter(TokenStream _in) { super(_in); stemmer = new DutchStemmer(); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } /** @@ -61,6 +64,23 @@ stemmer.setStemDictionary(stemdictionary); } + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String term = termAtt.term(); + + // Check the exclusion table. 
+ if (exclusions == null || !exclusions.contains(term)) { + String s = stemmer.stem(term); + // If not stemmed, don't waste the time adjusting the token. + if ((s != null) && !s.equals(term)) + termAtt.setTermBuffer(s); + } + return true; + } else { + return false; + } + } + /** * @return Returns the next token in the stream, or null at EOS */ Index: contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java (revision 776655) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java (working copy) @@ -20,6 +20,8 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.index.Payload; import java.io.IOException; @@ -33,13 +35,27 @@ private String typeMatch; private Payload thePayload; + private PayloadAttribute payloadAtt; + private TypeAttribute typeAtt; public NumericPayloadTokenFilter(TokenStream input, float payload, String typeMatch) { super(input); //Need to encode the payload thePayload = new Payload(PayloadHelper.encodeFloat(payload)); this.typeMatch = typeMatch; + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); } + + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (typeAtt.type().equals(typeMatch)) + payloadAtt.setPayload(thePayload); + return true; + } else { + return false; + } + } public Token next(final Token reusableToken) throws IOException { assert reusableToken != null; Index: 
contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java (revision 776655) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java (working copy) @@ -20,6 +20,8 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.index.Payload; import java.io.IOException; @@ -32,11 +34,27 @@ * **/ public class TokenOffsetPayloadTokenFilter extends TokenFilter { + private PayloadAttribute payloadAtt; + private OffsetAttribute offsetAtt; - public TokenOffsetPayloadTokenFilter(TokenStream input) { super(input); + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); } + + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + byte[] data = new byte[8]; + PayloadHelper.encodeInt(offsetAtt.startOffset(), data, 0); + PayloadHelper.encodeInt(offsetAtt.endOffset(), data, 4); + Payload payload = new Payload(data); + payloadAtt.setPayload(payload); + return true; + } else { + return false; + } + } public Token next(final Token reusableToken) throws IOException { assert reusableToken != null; Index: contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java (revision 776655) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java (working copy) @@ 
-20,6 +20,8 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.index.Payload; import java.io.IOException; @@ -32,13 +34,26 @@ * **/ public class TypeAsPayloadTokenFilter extends TokenFilter { - + private PayloadAttribute payloadAtt; + private TypeAttribute typeAtt; + public TypeAsPayloadTokenFilter(TokenStream input) { super(input); + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); + } + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (typeAtt.type() != null && typeAtt.type().equals("") == false) { + payloadAtt.setPayload(new Payload(typeAtt.type().getBytes("UTF-8"))); + } + return true; + } else { + return false; + } } - - + public Token next(final Token reusableToken) throws IOException { assert reusableToken != null; Token nextToken = input.next(reusableToken); Index: contrib/analyzers/src/java/org/apache/lucene/analysis/position/PositionFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/position/PositionFilter.java (revision 776655) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/position/PositionFilter.java (working copy) @@ -22,6 +22,7 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; /** Set the positionIncrement of all tokens to the "positionIncrement", * except the first return token which retains its original positionIncrement value. 
@@ -34,6 +35,8 @@ /** The first token must have non-zero positionIncrement **/ private boolean firstTokenPositioned = false; + + private PositionIncrementAttribute posIncrAtt; /** * Constructs a PositionFilter that assigns a position increment of zero to @@ -43,6 +46,7 @@ */ public PositionFilter(final TokenStream input) { super(input); + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); } /** @@ -57,6 +61,19 @@ this(input); this.positionIncrement = positionIncrement; } + + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (firstTokenPositioned) { + posIncrAtt.setPositionIncrement(positionIncrement); + } else { + firstTokenPositioned = true; + } + return true; + } else { + return false; + } + } public Token next(Token reusableToken) throws IOException { Index: contrib/analyzers/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java (revision 776655) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java (working copy) @@ -20,6 +20,7 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import java.io.IOException; @@ -30,9 +31,21 @@ */ public final class ReverseStringFilter extends TokenFilter { + private TermAttribute termAtt; + public ReverseStringFilter(TokenStream in) { super(in); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } + + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + reverse( termAtt.termBuffer(), termAtt.termLength() ); + return true; + } else { + return false; + } + } public final Token next(Token in) throws IOException { assert in != null; Index: 
contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java (revision 776655) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java (working copy) @@ -17,9 +17,12 @@ * limitations under the License. */ +import java.io.IOException; + import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Normalizes token text to lower case, analyzing given ("russian") charset. @@ -30,12 +33,28 @@ public final class RussianLowerCaseFilter extends TokenFilter { char[] charset; - + private TermAttribute termAtt; + public RussianLowerCaseFilter(TokenStream in, char[] charset) { super(in); this.charset = charset; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } + + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + char[] chArray = termAtt.termBuffer(); + int chLen = termAtt.termLength(); + for (int i = 0; i < chLen; i++) + { + chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset); + } + return true; + } else { + return false; + } + } public final Token next(final Token reusableToken) throws java.io.IOException { Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java (revision 776655) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java (working copy) @@ -20,6 +20,8 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import 
org.apache.lucene.analysis.tokenattributes.TermAttribute; + import java.io.IOException; /** @@ -36,12 +38,27 @@ * The actual token in the input stream. */ private RussianStemmer stemmer = null; + + private TermAttribute termAtt; public RussianStemFilter(TokenStream in, char[] charset) { super(in); stemmer = new RussianStemmer(charset); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } + + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String term = termAtt.term(); + String s = stemmer.stem(term); + if (s != null && !s.equals(term)) + termAtt.setTermBuffer(s); + return true; + } else { + return false; + } + } /** * @return Returns the next token in the stream, or null at EOS Index: contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (revision 776655) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (working copy) @@ -22,6 +22,9 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + import java.text.BreakIterator; /** @@ -34,11 +37,52 @@ private BreakIterator breaker = null; private Token thaiToken = null; + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; + + private TermAttribute thaiTermAtt = null; + private OffsetAttribute thaiOffsetAtt = null; + public ThaiWordFilter(TokenStream input) { super(input); breaker = BreakIterator.getWordInstance(new Locale("th")); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); } + public boolean incrementToken() throws IOException { + if (thaiTermAtt != 
null) { + int start = breaker.current(); + int end = breaker.next(); + if (end != BreakIterator.DONE) { + termAtt.setTermBuffer(thaiTermAtt.termBuffer(), start, end - start); + offsetAtt.setOffset(thaiOffsetAtt.startOffset() + start, thaiOffsetAtt.startOffset() + end); + return true; + } + thaiTermAtt = null; + } + + if (input.incrementToken() == false || termAtt.termLength() == 0) + return false; + + String text = termAtt.term(); + if (UnicodeBlock.of(text.charAt(0)) != UnicodeBlock.THAI) { + termAtt.setTermBuffer(text.toLowerCase()); + return true; + } + + thaiTermAtt = (TermAttribute) termAtt.clone(); + thaiOffsetAtt = (OffsetAttribute) offsetAtt.clone(); + breaker.setText(text); + int end = breaker.next(); + if (end != BreakIterator.DONE) { + termAtt.setTermBuffer(text, 0, end); + offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset() + end); + return true; + } + return false; +} + public Token next(final Token reusableToken) throws IOException { assert reusableToken != null; if (thaiToken != null) { Index: contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java =================================================================== --- contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java (revision 776655) +++ contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java (working copy) @@ -29,6 +29,7 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.WhitespaceTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Test the Arabic Normalization Filter @@ -95,11 +96,10 @@ private void check(final String input, final String expected) throws IOException { ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input)); ArabicNormalizationFilter filter = new ArabicNormalizationFilter(tokenStream); - final Token 
reusableToken = new Token(); - Token nextToken = filter.next(reusableToken); - if (nextToken == null) - fail(); - assertEquals(expected, nextToken.term()); + TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class); + + assertTrue(filter.incrementToken()); + assertEquals(expected, termAtt.term()); filter.close(); } Index: contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java =================================================================== --- contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java (revision 776655) +++ contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java (working copy) @@ -28,6 +28,7 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Test the Arabic Normalization Filter @@ -118,11 +119,10 @@ private void check(final String input, final String expected) throws IOException { ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input)); ArabicStemFilter filter = new ArabicStemFilter(tokenStream); - final Token reusableToken = new Token(); - Token nextToken = filter.next(reusableToken); - if (nextToken == null) - fail(); - assertEquals(expected, nextToken.term()); + TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class); + + assertTrue(filter.incrementToken()); + assertEquals(expected, termAtt.term()); filter.close(); } Index: contrib/analyzers/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java =================================================================== --- contrib/analyzers/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java (revision 776655) +++ contrib/analyzers/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java (working copy) @@ -28,6 +28,7 @@ import org.apache.lucene.analysis.Token; import 
org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Test the German stemmer. The stemming algorithm is known to work less @@ -68,11 +69,9 @@ private void check(final String input, final String expected) throws IOException { StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input)); GermanStemFilter filter = new GermanStemFilter(tokenStream); - final Token reusableToken = new Token(); - Token nextToken = filter.next(reusableToken); - if (nextToken == null) - fail(); - assertEquals(expected, nextToken.term()); + TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class); + assertTrue(filter.incrementToken()); + assertEquals(expected, termAtt.term()); filter.close(); } Index: contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java =================================================================== --- contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java (revision 776655) +++ contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java (working copy) @@ -21,6 +21,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import junit.framework.TestCase; @@ -41,13 +42,12 @@ */ private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception { TokenStream ts = a.tokenStream("dummy", new StringReader(input)); - final Token reusableToken = new Token(); + TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); for (int i=0; i<output.length; i++) { - Token nextToken = ts.next(reusableToken); - assertTrue(nextToken != null); - assertEquals(output[i], nextToken.term()); + assertTrue(ts.incrementToken()); + assertEquals(output[i], termAtt.term()); } - assertTrue(ts.next(reusableToken) == null); + assertFalse(ts.incrementToken()); ts.close(); } Index: contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java =================================================================== --- contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java (revision 776655) +++ contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java (working copy) @@ -22,6 +22,7 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.IndexableBinaryStringTools; @@ -58,6 +59,7 @@ public class CollationKeyFilter extends TokenFilter { private Collator collator = null; + private TermAttribute termAtt; /** * @@ -67,7 +69,28 @@ public CollationKeyFilter(TokenStream input, Collator collator) { super(input); this.collator = collator; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } + + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + char[] termBuffer = termAtt.termBuffer(); + String termText = new String(termBuffer, 0, termAtt.termLength()); + byte[] collationKey = collator.getCollationKey(termText).toByteArray(); + ByteBuffer collationKeyBuf = ByteBuffer.wrap(collationKey); + int encodedLength = IndexableBinaryStringTools.getEncodedLength(collationKeyBuf); + if (encodedLength > termBuffer.length) { + termAtt.resizeTermBuffer(encodedLength); + } + termAtt.setTermLength(encodedLength); + CharBuffer wrappedTermBuffer = CharBuffer.wrap(termAtt.termBuffer()); + IndexableBinaryStringTools.encode(collationKeyBuf, wrappedTermBuffer); + return true; + } else { + 
return false; + } + } public final Token next(final Token reusableToken) throws IOException { assert reusableToken != null; Token nextToken = input.next(reusableToken); Index: contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java =================================================================== --- contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java (revision 776655) +++ contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java (working copy) @@ -24,6 +24,7 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.IndexableBinaryStringTools; import java.io.IOException; @@ -64,6 +65,7 @@ public class ICUCollationKeyFilter extends TokenFilter { private Collator collator = null; private RawCollationKey reusableKey = new RawCollationKey(); + private TermAttribute termAtt; /** * @@ -73,7 +75,28 @@ public ICUCollationKeyFilter(TokenStream input, Collator collator) { super(input); this.collator = collator; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } + + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + char[] termBuffer = termAtt.termBuffer(); + String termText = new String(termBuffer, 0, termAtt.termLength()); + collator.getRawCollationKey(termText, reusableKey); + ByteBuffer collationKeyBuf = ByteBuffer.wrap(reusableKey.bytes, 0, reusableKey.size); + int encodedLength + = IndexableBinaryStringTools.getEncodedLength(collationKeyBuf); + if (encodedLength > termBuffer.length) { + termAtt.resizeTermBuffer(encodedLength); + } + termAtt.setTermLength(encodedLength); + CharBuffer wrappedTermBuffer = CharBuffer.wrap(termAtt.termBuffer()); + IndexableBinaryStringTools.encode(collationKeyBuf, wrappedTermBuffer); + return true; + } else { + return false; + } + } 
public final Token next(final Token reusableToken) throws IOException { assert reusableToken != null;