Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java (revision 935541) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java (working copy) @@ -22,6 +22,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.StopAnalyzer; import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.standard.StandardAnalyzer; @@ -59,6 +60,7 @@ final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(source); result = new ThaiWordFilter(result); + result = new LowerCaseFilter(matchVersion, result); return new TokenStreamComponents(source, new StopFilter(matchVersion, result, StopAnalyzer.ENGLISH_STOP_WORDS_SET)); } Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (revision 935541) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (working copy) @@ -19,63 +19,68 @@ import java.io.IOException; import java.util.Locale; import java.lang.Character.UnicodeBlock; +import javax.swing.text.Segment; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import java.text.BreakIterator; /** * {@link TokenFilter} that use {@link java.text.BreakIterator} to break each * Token that is Thai into separate Token(s) for each Thai word. - * @version 0.2 */ public final class ThaiWordFilter extends TokenFilter { - private BreakIterator breaker = null; + private final BreakIterator breaker; + private final Segment charIterator; - private TermAttribute termAtt; - private OffsetAttribute offsetAtt; + private final CharTermAttribute termAtt; + private final OffsetAttribute offsetAtt; private State thaiState = null; public ThaiWordFilter(TokenStream input) { super(input); breaker = BreakIterator.getWordInstance(new Locale("th")); - termAtt = addAttribute(TermAttribute.class); + charIterator = new Segment(); + termAtt = addAttribute(CharTermAttribute.class); offsetAtt = addAttribute(OffsetAttribute.class); } @Override - public final boolean incrementToken() throws IOException { + public boolean incrementToken() throws IOException { if (thaiState != null) { int start = breaker.current(); int end = breaker.next(); if (end != BreakIterator.DONE) { restoreState(thaiState); - termAtt.setTermBuffer(termAtt.termBuffer(), start, end - start); + termAtt.copyBuffer(termAtt.buffer(), start, end - start); offsetAtt.setOffset(offsetAtt.startOffset() + start, offsetAtt.startOffset() + end); return true; } thaiState = null; } - if (input.incrementToken() == false || termAtt.termLength() == 0) + if (!input.incrementToken()) { return false; - - String text = termAtt.term(); - if (UnicodeBlock.of(text.charAt(0)) != UnicodeBlock.THAI) { - termAtt.setTermBuffer(text.toLowerCase()); + } + + if (termAtt.length() == 0 || UnicodeBlock.of(termAtt.charAt(0)) != UnicodeBlock.THAI) { return true; } thaiState = captureState(); - breaker.setText(text); + charIterator.array = termAtt.buffer(); + charIterator.offset = 0; + charIterator.count = termAtt.length(); + breaker.setText(charIterator); + int end = breaker.next(); if (end != BreakIterator.DONE) { - termAtt.setTermBuffer(text, 0, end); + termAtt.setLength(end); offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset() + end); return true; }