Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java (revision 776655)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java (working copy)
@@ -22,6 +22,7 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* A TokenFilter that applies {@link ArabicNormalizer} to normalize the orthography.
@@ -31,14 +32,26 @@
public class ArabicNormalizationFilter extends TokenFilter {
protected ArabicNormalizer normalizer = null;
-
+ private TermAttribute termAtt;
+
public ArabicNormalizationFilter(TokenStream input) {
super(input);
normalizer = new ArabicNormalizer();
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ int oldlen = termAtt.termLength();
+ int newlen = normalizer.normalize(termAtt.termBuffer(), oldlen);
+ if (oldlen != newlen)
+ termAtt.setTermLength(newlen);
+ return true;
+ } else {
+ return false;
+ }
+ }
-
public Token next(Token reusableToken) throws IOException {
if ((reusableToken = input.next(reusableToken)) == null) {
return null;
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java (revision 776655)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java (working copy)
@@ -22,6 +22,7 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* A TokenFilter that applies {@link ArabicStemmer} to stem Arabic words..
@@ -31,14 +32,26 @@
public class ArabicStemFilter extends TokenFilter {
protected ArabicStemmer stemmer = null;
-
+ private TermAttribute termAtt;
+
public ArabicStemFilter(TokenStream input) {
super(input);
stemmer = new ArabicStemmer();
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ int oldlen = termAtt.termLength();
+ int newlen = stemmer.stem(termAtt.termBuffer(), oldlen);
+ if (oldlen != newlen)
+ termAtt.setTermLength(newlen);
+ return true;
+ } else {
+ return false;
+ }
+ }
-
/**
* @return Returns the next token in the stream, or null at EOS
*/
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java (revision 776655)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java (working copy)
@@ -20,6 +20,7 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.io.IOException;
import java.util.HashSet;
@@ -36,16 +37,34 @@
*/
private BrazilianStemmer stemmer = null;
private Set exclusions = null;
+ private TermAttribute termAtt;
public BrazilianStemFilter(TokenStream in) {
super(in);
stemmer = new BrazilianStemmer();
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
public BrazilianStemFilter(TokenStream in, Set exclusiontable) {
this(in);
this.exclusions = exclusiontable;
}
+
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ String term = termAtt.term();
+ // Check the exclusion table.
+ if (exclusions == null || !exclusions.contains(term)) {
+ String s = stemmer.stem(term);
+ // If not stemmed, don't waste the time adjusting the token.
+ if ((s != null) && !s.equals(term))
+ termAtt.setTermBuffer(s);
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
/**
* @return Returns the next token in the stream, or null at EOS.
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (revision 776655)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (working copy)
@@ -19,7 +19,12 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import java.io.IOException;
import java.io.Reader;
@@ -76,6 +81,10 @@
* C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
*/
private boolean preIsTokened = false;
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+ private TypeAttribute typeAtt;
//~ Constructors -----------------------------------------------------------
@@ -86,10 +95,157 @@
*/
public CJKTokenizer(Reader in) {
input = in;
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
}
//~ Methods ----------------------------------------------------------------
+ public boolean incrementToken() throws IOException {
+ /** how many character(s) has been stored in buffer */
+ int length = 0;
+
+ /** the position used to create Token */
+ int start = offset;
+
+ while (true) {
+ /** current character */
+ char c;
+
+ /** unicode block of current character for detail */
+ Character.UnicodeBlock ub;
+
+ offset++;
+
+ if (bufferIndex >= dataLen) {
+ dataLen = input.read(ioBuffer);
+ bufferIndex = 0;
+ }
+
+ if (dataLen == -1) {
+ if (length > 0) {
+ if (preIsTokened == true) {
+ length = 0;
+ preIsTokened = false;
+ }
+
+ break;
+ } else {
+ return false;
+ }
+ } else {
+ //get current character
+ c = ioBuffer[bufferIndex++];
+
+ //get the UnicodeBlock of the current character
+ ub = Character.UnicodeBlock.of(c);
+ }
+
+ //if the current character is ASCII or Extend ASCII
+ if ((ub == Character.UnicodeBlock.BASIC_LATIN)
+ || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)
+ ) {
+ if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
+ int i = (int) c;
+ if (i >= 65281 && i <= 65374) {
+ /** convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
+ i = i - 65248;
+ c = (char) i;
+ }
+ }
+
+ // if the current character is a letter or "_" "+" "#"
+ if (Character.isLetterOrDigit(c)
+ || ((c == '_') || (c == '+') || (c == '#'))
+ ) {
+ if (length == 0) {
+ // "javaC1C2C3C4linux"
+ // ^--: the current character begin to token the ASCII
+ // letter
+ start = offset - 1;
+ } else if (tokenType == "double") {
+ // "javaC1C2C3C4linux"
+ // ^--: the previous non-ASCII
+ // : the current character
+ offset--;
+ bufferIndex--;
+ tokenType = "single";
+
+ if (preIsTokened == true) {
+ // there is only one non-ASCII has been stored
+ length = 0;
+ preIsTokened = false;
+
+ break;
+ } else {
+ break;
+ }
+ }
+
+ // store the LowerCase(c) in the buffer
+ buffer[length++] = Character.toLowerCase(c);
+ tokenType = "single";
+
+ // break the procedure if buffer overflowed!
+ if (length == MAX_WORD_LEN) {
+ break;
+ }
+ } else if (length > 0) {
+ if (preIsTokened == true) {
+ length = 0;
+ preIsTokened = false;
+ } else {
+ break;
+ }
+ }
+ } else {
+ // non-ASCII letter, e.g."C1C2C3C4"
+ if (Character.isLetter(c)) {
+ if (length == 0) {
+ start = offset - 1;
+ buffer[length++] = c;
+ tokenType = "double";
+ } else {
+ if (tokenType == "single") {
+ offset--;
+ bufferIndex--;
+
+ //return the previous ASCII characters
+ break;
+ } else {
+ buffer[length++] = c;
+ tokenType = "double";
+
+ if (length == 2) {
+ offset--;
+ bufferIndex--;
+ preIsTokened = true;
+
+ break;
+ }
+ }
+ }
+ } else if (length > 0) {
+ if (preIsTokened == true) {
+ // empty the buffer
+ length = 0;
+ preIsTokened = false;
+ } else {
+ break;
+ }
+ }
+ }
+ }
+
+ clearAttributes();
+ termAtt.setTermBuffer(buffer, 0, length);
+ offsetAtt.setOffset(start, start+length);
+ typeAtt.setType(tokenType);
+
+ return true;
+ }
+
/**
* Returns the next token in the stream, or null at EOS.
* See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java (revision 776655)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java (working copy)
@@ -20,6 +20,7 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.io.IOException;
import java.util.Set;
@@ -39,11 +40,14 @@
*/
private GermanStemmer stemmer = null;
private Set exclusionSet = null;
+
+ private TermAttribute termAtt;
public GermanStemFilter( TokenStream in )
{
super(in);
stemmer = new GermanStemmer();
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
/**
@@ -54,6 +58,22 @@
this( in );
this.exclusionSet = exclusionSet;
}
+
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ String term = termAtt.term();
+ // Check the exclusion table.
+ if (exclusionSet == null || !exclusionSet.contains(term)) {
+ String s = stemmer.stem(term);
+ // If not stemmed, don't waste the time adjusting the token.
+ if ((s != null) && !s.equals(term))
+ termAtt.setTermBuffer(s);
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
/**
* @return Returns the next token in the stream, or null at EOS
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java (revision 776655)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java (working copy)
@@ -16,9 +16,12 @@
* limitations under the License.
*/
+import java.io.IOException;
+
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Normalizes token text to lower case, analyzing given ("greek") charset.
@@ -27,12 +30,28 @@
public final class GreekLowerCaseFilter extends TokenFilter
{
char[] charset;
+ private TermAttribute termAtt;
public GreekLowerCaseFilter(TokenStream in, char[] charset)
{
super(in);
this.charset = charset;
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
+
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ char[] chArray = termAtt.termBuffer();
+ int chLen = termAtt.termLength();
+ for (int i = 0; i < chLen; i++)
+ {
+ chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset);
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
public final Token next(final Token reusableToken) throws java.io.IOException
{
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (revision 776655)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (working copy)
@@ -25,6 +25,7 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Removes elisions from a token stream. For example, "l'avion" (the plane) will be
@@ -38,6 +39,8 @@
private Set articles = null;
private static char[] apostrophes = {'\'', '’'};
+
+ private TermAttribute termAtt;
public void setArticles(Set articles) {
this.articles = new HashSet();
@@ -54,6 +57,7 @@
super(input);
this.articles = new HashSet(Arrays.asList(new String[] { "l", "m", "t",
"qu", "n", "s", "j" }));
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
/**
@@ -62,6 +66,7 @@
public ElisionFilter(TokenStream input, Set articles) {
super(input);
setArticles(articles);
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
/**
@@ -70,8 +75,38 @@
public ElisionFilter(TokenStream input, String[] articles) {
super(input);
setArticles(new HashSet(Arrays.asList(articles)));
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
+
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ char[] termBuffer = termAtt.termBuffer();
+ int termLength = termAtt.termLength();
+ int minPoz = Integer.MAX_VALUE;
+ for (int i = 0; i < apostrophes.length; i++) {
+ char apos = apostrophes[i];
+ // The equivalent of String.indexOf(ch)
+ for (int poz = 0; poz < termLength ; poz++) {
+ if (termBuffer[poz] == apos) {
+ minPoz = Math.min(poz, minPoz);
+ break;
+ }
+ }
+ }
+
+ // An apostrophe has been found. If the prefix is an article strip it off.
+ if (minPoz != Integer.MAX_VALUE
+ && articles.contains(new String(termAtt.termBuffer(), 0, minPoz).toLowerCase())) {
+ termAtt.setTermBuffer(termAtt.termBuffer(), minPoz + 1, termAtt.termLength() - (minPoz + 1));
+ }
+
+ return true;
+ } else {
+ return false;
+ }
+ }
+
/**
* Returns the next input Token with term() without elisioned start
*/
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java (revision 776655)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java (working copy)
@@ -20,6 +20,7 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.io.IOException;
import java.util.HashSet;
@@ -39,10 +40,13 @@
*/
private FrenchStemmer stemmer = null;
private Set exclusions = null;
+
+ private TermAttribute termAtt;
public FrenchStemFilter( TokenStream in ) {
super(in);
stemmer = new FrenchStemmer();
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
@@ -50,7 +54,24 @@
this( in );
exclusions = exclusiontable;
}
+
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ String term = termAtt.term();
+ // Check the exclusion table
+ if ( exclusions == null || !exclusions.contains( term ) ) {
+ String s = stemmer.stem( term );
+ // If not stemmed, don't waste the time adjusting the token.
+ if ((s != null) && !s.equals( term ) )
+ termAtt.setTermBuffer(s);
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+
/**
* @return Returns the next token in the stream, or null at EOS
*/
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java (revision 776655)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java (working copy)
@@ -27,8 +27,13 @@
*/
public class EmptyTokenStream extends TokenStream {
+ public boolean incrementToken() throws IOException {
+ return false;
+ }
+
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
return null;
}
+
}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java (revision 776655)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java (working copy)
@@ -20,6 +20,7 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.io.IOException;
import java.util.HashMap;
@@ -39,10 +40,12 @@
*/
private DutchStemmer stemmer = null;
private Set exclusions = null;
+ private TermAttribute termAtt;
public DutchStemFilter(TokenStream _in) {
super(_in);
stemmer = new DutchStemmer();
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
/**
@@ -61,6 +64,23 @@
stemmer.setStemDictionary(stemdictionary);
}
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ String term = termAtt.term();
+
+ // Check the exclusion table.
+ if (exclusions == null || !exclusions.contains(term)) {
+ String s = stemmer.stem(term);
+ // If not stemmed, don't waste the time adjusting the token.
+ if ((s != null) && !s.equals(term))
+ termAtt.setTermBuffer(s);
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+
/**
* @return Returns the next token in the stream, or null at EOS
*/
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java (revision 776655)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java (working copy)
@@ -20,6 +20,8 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.Payload;
import java.io.IOException;
@@ -33,13 +35,27 @@
private String typeMatch;
private Payload thePayload;
+ private PayloadAttribute payloadAtt;
+ private TypeAttribute typeAtt;
public NumericPayloadTokenFilter(TokenStream input, float payload, String typeMatch) {
super(input);
//Need to encode the payload
thePayload = new Payload(PayloadHelper.encodeFloat(payload));
this.typeMatch = typeMatch;
+ payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+ typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
}
+
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ if (typeAtt.type().equals(typeMatch))
+ payloadAtt.setPayload(thePayload);
+ return true;
+ } else {
+ return false;
+ }
+ }
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java (revision 776655)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java (working copy)
@@ -20,6 +20,8 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.index.Payload;
import java.io.IOException;
@@ -32,11 +34,27 @@
*
**/
public class TokenOffsetPayloadTokenFilter extends TokenFilter {
+ private PayloadAttribute payloadAtt;
+ private OffsetAttribute offsetAtt;
-
public TokenOffsetPayloadTokenFilter(TokenStream input) {
super(input);
+ payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
+
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ byte[] data = new byte[8];
+ PayloadHelper.encodeInt(offsetAtt.startOffset(), data, 0);
+ PayloadHelper.encodeInt(offsetAtt.endOffset(), data, 4);
+ Payload payload = new Payload(data);
+ payloadAtt.setPayload(payload);
+ return true;
+ } else {
+ return false;
+ }
+ }
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java (revision 776655)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java (working copy)
@@ -20,6 +20,8 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.Payload;
import java.io.IOException;
@@ -32,13 +34,26 @@
*
**/
public class TypeAsPayloadTokenFilter extends TokenFilter {
-
+ private PayloadAttribute payloadAtt;
+ private TypeAttribute typeAtt;
+
public TypeAsPayloadTokenFilter(TokenStream input) {
super(input);
+ payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+ typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+ }
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ if (typeAtt.type() != null && typeAtt.type().equals("") == false) {
+ payloadAtt.setPayload(new Payload(typeAtt.type().getBytes("UTF-8")));
+ }
+ return true;
+ } else {
+ return false;
+ }
}
-
-
+
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/position/PositionFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/position/PositionFilter.java (revision 776655)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/position/PositionFilter.java (working copy)
@@ -22,6 +22,7 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/** Set the positionIncrement of all tokens to the "positionIncrement",
* except the first return token which retains its original positionIncrement value.
@@ -34,6 +35,8 @@
/** The first token must have non-zero positionIncrement **/
private boolean firstTokenPositioned = false;
+
+ private PositionIncrementAttribute posIncrAtt;
/**
* Constructs a PositionFilter that assigns a position increment of zero to
@@ -43,6 +46,7 @@
*/
public PositionFilter(final TokenStream input) {
super(input);
+ posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
}
/**
@@ -57,6 +61,19 @@
this(input);
this.positionIncrement = positionIncrement;
}
+
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ if (firstTokenPositioned) {
+ posIncrAtt.setPositionIncrement(positionIncrement);
+ } else {
+ firstTokenPositioned = true;
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
public Token next(Token reusableToken) throws IOException {
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java (revision 776655)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java (working copy)
@@ -20,6 +20,7 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.io.IOException;
@@ -30,9 +31,21 @@
*/
public final class ReverseStringFilter extends TokenFilter {
+ private TermAttribute termAtt;
+
public ReverseStringFilter(TokenStream in) {
super(in);
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
+
+ public final boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ reverse( termAtt.termBuffer(), termAtt.termLength() );
+ return true;
+ } else {
+ return false;
+ }
+ }
public final Token next(Token in) throws IOException {
assert in != null;
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java (revision 776655)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java (working copy)
@@ -17,9 +17,12 @@
* limitations under the License.
*/
+import java.io.IOException;
+
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Normalizes token text to lower case, analyzing given ("russian") charset.
@@ -30,12 +33,28 @@
public final class RussianLowerCaseFilter extends TokenFilter
{
char[] charset;
-
+ private TermAttribute termAtt;
+
public RussianLowerCaseFilter(TokenStream in, char[] charset)
{
super(in);
this.charset = charset;
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
+
+ public final boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ char[] chArray = termAtt.termBuffer();
+ int chLen = termAtt.termLength();
+ for (int i = 0; i < chLen; i++)
+ {
+ chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
public final Token next(final Token reusableToken) throws java.io.IOException
{
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java (revision 776655)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java (working copy)
@@ -20,6 +20,8 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
import java.io.IOException;
/**
@@ -36,12 +38,27 @@
* The actual token in the input stream.
*/
private RussianStemmer stemmer = null;
+
+ private TermAttribute termAtt;
public RussianStemFilter(TokenStream in, char[] charset)
{
super(in);
stemmer = new RussianStemmer(charset);
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
+
+ public final boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ String term = termAtt.term();
+ String s = stemmer.stem(term);
+ if (s != null && !s.equals(term))
+ termAtt.setTermBuffer(s);
+ return true;
+ } else {
+ return false;
+ }
+ }
/**
* @return Returns the next token in the stream, or null at EOS
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (revision 776655)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (working copy)
@@ -22,6 +22,9 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
import java.text.BreakIterator;
/**
@@ -34,11 +37,52 @@
private BreakIterator breaker = null;
private Token thaiToken = null;
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+
+ private TermAttribute thaiTermAtt = null;
+ private OffsetAttribute thaiOffsetAtt = null;
+
public ThaiWordFilter(TokenStream input) {
super(input);
breaker = BreakIterator.getWordInstance(new Locale("th"));
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
+ public boolean incrementToken() throws IOException {
+ if (thaiTermAtt != null) {
+ int start = breaker.current();
+ int end = breaker.next();
+ if (end != BreakIterator.DONE) {
+ termAtt.setTermBuffer(thaiTermAtt.termBuffer(), start, end - start);
+        offsetAtt.setOffset(thaiOffsetAtt.startOffset() + start, thaiOffsetAtt.startOffset() + end);
+ return true;
+ }
+ thaiTermAtt = null;
+ }
+
+ if (input.incrementToken() == false || termAtt.termLength() == 0)
+ return false;
+
+ String text = termAtt.term();
+ if (UnicodeBlock.of(text.charAt(0)) != UnicodeBlock.THAI) {
+ termAtt.setTermBuffer(text.toLowerCase());
+ return true;
+ }
+
+ thaiTermAtt = (TermAttribute) termAtt.clone();
+ thaiOffsetAtt = (OffsetAttribute) offsetAtt.clone();
+ breaker.setText(text);
+ int end = breaker.next();
+ if (end != BreakIterator.DONE) {
+ termAtt.setTermBuffer(text, 0, end);
+ offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset() + end);
+ return true;
+ }
+ return false;
+  }
+
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (thaiToken != null) {
Index: contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java
===================================================================
--- contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java (revision 776655)
+++ contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java (working copy)
@@ -29,6 +29,7 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Test the Arabic Normalization Filter
@@ -95,11 +96,10 @@
private void check(final String input, final String expected) throws IOException {
ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input));
ArabicNormalizationFilter filter = new ArabicNormalizationFilter(tokenStream);
- final Token reusableToken = new Token();
- Token nextToken = filter.next(reusableToken);
- if (nextToken == null)
- fail();
- assertEquals(expected, nextToken.term());
+ TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
+
+ assertTrue(filter.incrementToken());
+ assertEquals(expected, termAtt.term());
filter.close();
}
Index: contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
===================================================================
--- contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java (revision 776655)
+++ contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java (working copy)
@@ -28,6 +28,7 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Test the Arabic Normalization Filter
@@ -118,11 +119,10 @@
private void check(final String input, final String expected) throws IOException {
ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input));
ArabicStemFilter filter = new ArabicStemFilter(tokenStream);
- final Token reusableToken = new Token();
- Token nextToken = filter.next(reusableToken);
- if (nextToken == null)
- fail();
- assertEquals(expected, nextToken.term());
+ TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
+
+ assertTrue(filter.incrementToken());
+ assertEquals(expected, termAtt.term());
filter.close();
}
Index: contrib/analyzers/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
===================================================================
--- contrib/analyzers/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java (revision 776655)
+++ contrib/analyzers/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java (working copy)
@@ -28,6 +28,7 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Test the German stemmer. The stemming algorithm is known to work less
@@ -68,11 +69,9 @@
private void check(final String input, final String expected) throws IOException {
StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input));
GermanStemFilter filter = new GermanStemFilter(tokenStream);
- final Token reusableToken = new Token();
- Token nextToken = filter.next(reusableToken);
- if (nextToken == null)
- fail();
- assertEquals(expected, nextToken.term());
+ TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
+ assertTrue(filter.incrementToken());
+ assertEquals(expected, termAtt.term());
filter.close();
}
Index: contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java
===================================================================
--- contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java (revision 776655)
+++ contrib/analyzers/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java (working copy)
@@ -21,6 +21,7 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import junit.framework.TestCase;
@@ -41,13 +42,12 @@
*/
private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
- final Token reusableToken = new Token();
+ TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
for (int i=0; i<output.length; i++) {
-      Token nextToken = ts.next(reusableToken);
-      assertNotNull(nextToken);
-      assertEquals(nextToken.term(), output[i]);
+      assertTrue(ts.incrementToken());
+      assertEquals(termAtt.term(), output[i]);
}
-    assertNull(ts.next(reusableToken));
+    assertFalse(ts.incrementToken());
ts.close();
}
Index: contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java
===================================================================
--- contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java (revision 776655)
+++ contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java (working copy)
@@ -22,6 +22,7 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.IndexableBinaryStringTools;
import java.io.IOException;
@@ -70,6 +71,7 @@
public class CollationKeyFilter extends TokenFilter {
private Collator collator = null;
+  private TermAttribute termAtt;
/**
*
@@ -79,7 +81,28 @@
public CollationKeyFilter(TokenStream input, Collator collator) {
super(input);
this.collator = collator;
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
+
+  public final boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      char[] termBuffer = termAtt.termBuffer();
+      String termText = new String(termBuffer, 0, termAtt.termLength());
+      byte[] collationKey = collator.getCollationKey(termText).toByteArray();
+      ByteBuffer collationKeyBuf = ByteBuffer.wrap(collationKey);
+      int encodedLength
+        = IndexableBinaryStringTools.getEncodedLength(collationKeyBuf);
+      if (encodedLength > termBuffer.length) {
+ termAtt.resizeTermBuffer(encodedLength);
+ }
+ termAtt.setTermLength(encodedLength);
+ CharBuffer wrappedTermBuffer = CharBuffer.wrap(termAtt.termBuffer());
+ IndexableBinaryStringTools.encode(collationKeyBuf, wrappedTermBuffer);
+ return true;
+ } else {
+ return false;
+ }
+ }
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
Index: contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java
===================================================================
--- contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java (revision 776655)
+++ contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java (working copy)
@@ -24,6 +24,7 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.IndexableBinaryStringTools;
import java.io.IOException;
@@ -64,6 +65,7 @@
public class ICUCollationKeyFilter extends TokenFilter {
private Collator collator = null;
private RawCollationKey reusableKey = new RawCollationKey();
+ private TermAttribute termAtt;
/**
*
@@ -73,7 +75,28 @@
public ICUCollationKeyFilter(TokenStream input, Collator collator) {
super(input);
this.collator = collator;
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
+
+ public final boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ char[] termBuffer = termAtt.termBuffer();
+ String termText = new String(termBuffer, 0, termAtt.termLength());
+ collator.getRawCollationKey(termText, reusableKey);
+ ByteBuffer collationKeyBuf = ByteBuffer.wrap(reusableKey.bytes, 0, reusableKey.size);
+ int encodedLength
+ = IndexableBinaryStringTools.getEncodedLength(collationKeyBuf);
+ if (encodedLength > termBuffer.length) {
+ termAtt.resizeTermBuffer(encodedLength);
+ }
+ termAtt.setTermLength(encodedLength);
+ CharBuffer wrappedTermBuffer = CharBuffer.wrap(termAtt.termBuffer());
+ IndexableBinaryStringTools.encode(collationKeyBuf, wrappedTermBuffer);
+ return true;
+ } else {
+ return false;
+ }
+ }
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;