Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java (revision 935541) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java (working copy) @@ -22,6 +22,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.StopAnalyzer; import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.standard.StandardAnalyzer; @@ -58,7 +59,9 @@ Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(source); - result = new ThaiWordFilter(result); + if (matchVersion.onOrAfter(Version.LUCENE_31)) + result = new LowerCaseFilter(matchVersion, result); + result = new ThaiWordFilter(matchVersion, result); return new TokenStreamComponents(source, new StopFilter(matchVersion, result, StopAnalyzer.ENGLISH_STOP_WORDS_SET)); } Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (revision 935541) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (working copy) @@ -19,63 +19,88 @@ import java.io.IOException; import java.util.Locale; import java.lang.Character.UnicodeBlock; +import javax.swing.text.Segment; +import java.text.BreakIterator; + import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import 
org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.Version; -import java.text.BreakIterator; - /** * {@link TokenFilter} that use {@link java.text.BreakIterator} to break each * Token that is Thai into separate Token(s) for each Thai word. - * @version 0.2 + *
Please note: As of matchVersion 3.1, this filter no longer lowercases non-Thai text. + * {@link ThaiAnalyzer} will insert a {@link LowerCaseFilter} before this filter + * so the behaviour of the Analyzer does not change. */ public final class ThaiWordFilter extends TokenFilter { - private BreakIterator breaker = null; + private final BreakIterator breaker; + private final Segment charIterator; - private TermAttribute termAtt; - private OffsetAttribute offsetAtt; - - private State thaiState = null; + private final CharTermAttribute termAtt; + private final OffsetAttribute offsetAtt; + + private AttributeSource thaiClone = null; + private CharTermAttribute thaiTermAtt = null; + private boolean hasMoreTokensInClone = false; + @Deprecated public ThaiWordFilter(TokenStream input) { - super(input); + this(Version.LUCENE_30, input); + } + + public ThaiWordFilter(Version matchVersion, TokenStream input) { + super(matchVersion.onOrAfter(Version.LUCENE_31) ? + input : new LowerCaseFilter(matchVersion, input)); breaker = BreakIterator.getWordInstance(new Locale("th")); - termAtt = addAttribute(TermAttribute.class); + charIterator = new Segment(); + termAtt = addAttribute(CharTermAttribute.class); offsetAtt = addAttribute(OffsetAttribute.class); } @Override - public final boolean incrementToken() throws IOException { - if (thaiState != null) { + public boolean incrementToken() throws IOException { + if (hasMoreTokensInClone) { int start = breaker.current(); int end = breaker.next(); if (end != BreakIterator.DONE) { - restoreState(thaiState); - termAtt.setTermBuffer(termAtt.termBuffer(), start, end - start); + thaiClone.copyTo(this); + termAtt.copyBuffer(thaiTermAtt.buffer(), start, end - start); offsetAtt.setOffset(offsetAtt.startOffset() + start, offsetAtt.startOffset() + end); return true; } - thaiState = null; + hasMoreTokensInClone = false; } - if (input.incrementToken() == false || termAtt.termLength() == 0) + if (!input.incrementToken()) { return false; - - String 
text = termAtt.term(); - if (UnicodeBlock.of(text.charAt(0)) != UnicodeBlock.THAI) { - termAtt.setTermBuffer(text.toLowerCase()); + } + + if (termAtt.length() == 0 || UnicodeBlock.of(termAtt.charAt(0)) != UnicodeBlock.THAI) { return true; } - thaiState = captureState(); + hasMoreTokensInClone = true; + if (thaiClone == null) { + thaiClone = cloneAttributes(); + thaiTermAtt = thaiClone.getAttribute(CharTermAttribute.class); + } else { + this.copyTo(thaiClone); + } + + charIterator.array = thaiTermAtt.buffer(); + charIterator.offset = 0; + charIterator.count = thaiTermAtt.length(); + breaker.setText(charIterator); - breaker.setText(text); int end = breaker.next(); if (end != BreakIterator.DONE) { - termAtt.setTermBuffer(text, 0, end); + termAtt.setLength(end); offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset() + end); return true; } @@ -85,6 +110,6 @@ @Override public void reset() throws IOException { super.reset(); - thaiState = null; + hasMoreTokensInClone = false; } }