Index: CHANGES.txt
===================================================================
--- CHANGES.txt	(revision 813395)
+++ CHANGES.txt	(working copy)
@@ -350,10 +350,12 @@
    a top level reader and docID.  (Shai Erera, Chris Hostetter, Martin
    Ruckli, Mark Miller via Mike McCandless)
 
- * LUCENE-1466: Changed Tokenizer.input to be a CharStream; added
-   CharFilter and MappingCharFilter, which allows chaining & mapping
-   of characters before tokenizers run.  (Koji Sekiguchi via Mike
-   McCandless)
+ * LUCENE-1466, LUCENE-1906: Added CharFilter and MappingCharFilter, which allow
+   chaining & mapping of characters before tokenizers run. CharStream (a subclass
+   of Reader) is the base class for custom java.io.Readers that support offset
+   correction. Tokenizers now have an additional method correctOffset() that is
+   delegated to the underlying CharStream if the input is a subclass of
+   CharStream/CharFilter. (Koji Sekiguchi via Mike McCandless, Uwe Schindler)
 
 * LUCENE-1703: Add IndexWriter.waitForMerges.  (Tim Smith via Mike
   McCandless)
Index: common-build.xml
===================================================================
--- common-build.xml	(revision 813395)
+++ common-build.xml	(working copy)
@@ -42,7 +42,7 @@
-
+
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java	(revision 813395)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java	(working copy)
@@ -285,7 +285,7 @@
 
         if (length > 0) {
             termAtt.setTermBuffer(buffer, 0, length);
-            offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
+            offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
            typeAtt.setType(TOKEN_TYPE_NAMES[tokenType]);
            return true;
        } else if (dataLen == -1) {
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java	(revision 813395)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java	(working copy)
@@ -104,7 +104,7 @@
             //System.out.println(new String(buffer, 0,
             //length));
             termAtt.setTermBuffer(buffer, 0, length);
-            offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
+            offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
             return true;
         }
         else
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java	(revision 813395)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java	(working copy)
@@ -207,7 +207,7 @@
     int start = side == Side.FRONT ? 0 : inLen - gramSize;
     int end = start + gramSize;
     termAtt.setTermBuffer(inStr, start, gramSize);
-    offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(end));
+    offsetAtt.setOffset(correctOffset(start), correctOffset(end));
     gramSize++;
     return true;
   }
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java	(revision 813395)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java	(working copy)
@@ -124,7 +124,7 @@
     int oldPos = pos;
     pos++;
     termAtt.setTermBuffer(inStr, oldPos, gramSize);
-    offsetAtt.setOffset(input.correctOffset(oldPos), input.correctOffset(oldPos+gramSize));
+    offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize));
     return true;
   }
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java	(revision 813395)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java	(working copy)
@@ -116,7 +116,7 @@
       return false;
     else {
       termAtt.setTermBuffer(buffer.toString());
-      offsetAtt.setOffset(input.correctOffset(tokenStart), input.correctOffset(tokenEnd));
+      offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenEnd));
       typeAtt.setType("sentence");
       return true;
     }
Index: contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java
===================================================================
--- contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java	(revision 813395)
+++ contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java	(working copy)
@@ -202,7 +202,7 @@
         return false;
 
       termAtt.setTermBuffer(snippet, startTerm, lenTerm);
-      offsetAtt.setOffset(startOffset, startOffset + lenTerm);
+      offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset + lenTerm));
       return true;
     }
Index: contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
===================================================================
--- contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java	(revision 813395)
+++ contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java	(working copy)
@@ -17,7 +17,6 @@
 
 package org.apache.lucene.wikipedia.analysis;
 
-import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
@@ -127,10 +126,6 @@
   private TermAttribute termAtt;
   private FlagsAttribute flagsAtt;
 
-  void setInput(Reader reader) {
-    this.input = CharReader.get(reader);
-  }
-
   /**
    * Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
    * input to a newly created JFlex scanner.
@@ -267,7 +262,7 @@
           //trim the buffer
           String s = buffer.toString().trim();
           termAtt.setTermBuffer(s.toCharArray(), 0, s.length());
-          offsetAtt.setOffset(input.correctOffset(theStart), input.correctOffset(theStart + s.length()));
+          offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length()));
           flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
           //The way the loop is written, we will have proceeded to the next token.  We need to pushback the scanner to lastPos
           if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
@@ -305,7 +300,7 @@
           //trim the buffer
           String s = buffer.toString().trim();
           termAtt.setTermBuffer(s.toCharArray(), 0, s.length());
-          offsetAtt.setOffset(input.correctOffset(theStart), input.correctOffset(theStart + s.length()));
+          offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length()));
           flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
           //The way the loop is written, we will have proceeded to the next token.  We need to pushback the scanner to lastPos
           if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
@@ -318,7 +313,7 @@
   private void setupToken() {
     scanner.getText(termAtt);
     final int start = scanner.yychar();
-    offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start + termAtt.termLength()));
+    offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.termLength()));
   }
 
   /*
@@ -332,7 +327,7 @@
   }
 
   public void reset(Reader reader) throws IOException {
-    setInput(reader);
+    super.reset(reader);
     reset();
   }
Index: src/java/org/apache/lucene/analysis/BaseCharFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/BaseCharFilter.java	(revision 813395)
+++ src/java/org/apache/lucene/analysis/BaseCharFilter.java	(working copy)
@@ -43,7 +43,7 @@
   /** Retrieve the corrected offset. Note that this method
    * is slow, if you correct positions far before the most
    * recently added position, as it's a simple linear
-   * searhc backwards through all offset corrections added
+   * search backwards through all offset corrections added
    * by {@link #addOffCorrectMap}. */
   protected int correct(int currentOff) {
     if (pcmList == null || pcmList.isEmpty()) {
Index: src/java/org/apache/lucene/analysis/CharFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/CharFilter.java	(revision 813395)
+++ src/java/org/apache/lucene/analysis/CharFilter.java	(working copy)
@@ -21,6 +21,9 @@
 
 /**
  * Subclasses of CharFilter can be chained to filter CharStream.
+ * They can be used as a {@link java.io.Reader} with additional offset
+ * correction. {@link Tokenizer}s will automatically use {@link #correctOffset}
+ * if a CharFilter/CharStream subclass is used.
  *
  * @version $Id$
  *
Index: src/java/org/apache/lucene/analysis/CharStream.java
===================================================================
--- src/java/org/apache/lucene/analysis/CharStream.java	(revision 813395)
+++ src/java/org/apache/lucene/analysis/CharStream.java	(working copy)
@@ -20,12 +20,11 @@
 import java.io.Reader;
 
 /**
- * CharStream adds correctOffset
- * functionality over Reader.  All Tokenizers accept a
- * CharStream as input, which enables arbitrary character
- * based filtering before tokenization.  The {@link
- * #correctOffset} method fixed offsets to account for
+ * CharStream adds {@link #correctOffset}
+ * functionality over {@link Reader}.  All Tokenizers accept a
+ * CharStream in place of a plain {@link Reader} as input, which enables
+ * arbitrary character based filtering before tokenization.
+ * The {@link #correctOffset} method fixes offsets to account for
  * removal or insertion of characters, so that the offsets
  * reported in the tokens match the character offsets of the
  * original Reader.
Index: src/java/org/apache/lucene/analysis/CharTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/CharTokenizer.java	(revision 813395)
+++ src/java/org/apache/lucene/analysis/CharTokenizer.java	(working copy)
@@ -104,13 +104,13 @@
     }
 
     termAtt.setTermLength(length);
-    offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
+    offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
     return true;
   }
 
   public final void end() {
     // set final offset
-    int finalOffset = input.correctOffset(offset);
+    int finalOffset = correctOffset(offset);
     offsetAtt.setOffset(finalOffset, finalOffset);
   }
Index: src/java/org/apache/lucene/analysis/KeywordTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/KeywordTokenizer.java	(revision 813395)
+++ src/java/org/apache/lucene/analysis/KeywordTokenizer.java	(working copy)
@@ -76,8 +76,8 @@
         buffer = termAtt.resizeTermBuffer(1+buffer.length);
       }
       termAtt.setTermLength(upto);
-      finalOffset = input.correctOffset(upto);
-      offsetAtt.setOffset(input.correctOffset(0), finalOffset);
+      finalOffset = correctOffset(upto);
+      offsetAtt.setOffset(correctOffset(0), finalOffset);
       return true;
     }
     return false;
Index: src/java/org/apache/lucene/analysis/MappingCharFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/MappingCharFilter.java	(revision 813395)
+++ src/java/org/apache/lucene/analysis/MappingCharFilter.java	(working copy)
@@ -18,6 +18,7 @@
 package org.apache.lucene.analysis;
 
 import java.io.IOException;
+import java.io.Reader;
 import java.util.LinkedList;
 
 /**
@@ -35,11 +36,18 @@
   private int charPointer;
   private int nextCharCounter;
 
+  /** Default constructor that takes a {@link CharStream}. */
   public MappingCharFilter(NormalizeCharMap normMap, CharStream in) {
     super(in);
     this.normMap = normMap;
   }
 
+  /** Convenience constructor that takes a {@link Reader}. */
+  public MappingCharFilter(NormalizeCharMap normMap, Reader in) {
+    super(CharReader.get(in));
+    this.normMap = normMap;
+  }
+
   public int read() throws IOException {
     while(true) {
       if (replacement != null && charPointer < replacement.length()) {
Index: src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java	(revision 813395)
+++ src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java	(working copy)
@@ -20,7 +20,6 @@
 import java.io.IOException;
 import java.io.Reader;
 
-import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -92,10 +91,6 @@
    */
   private boolean replaceInvalidAcronym;
 
-  void setInput(Reader reader) {
-    input = CharReader.get(reader);
-  }
-
   private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
 
   /** Set the max allowed token length.  Any token longer
@@ -152,7 +147,7 @@
 
   private void init(Reader input, boolean replaceInvalidAcronym) {
     this.replaceInvalidAcronym = replaceInvalidAcronym;
-    setInput(input);
+    this.input = input;
     termAtt = (TermAttribute) addAttribute(TermAttribute.class);
     offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
     posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
@@ -186,7 +181,7 @@
       posIncrAtt.setPositionIncrement(posIncr);
       scanner.getText(termAtt);
       final int start = scanner.yychar();
-      offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+termAtt.termLength()));
+      offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.termLength()));
       // This 'if' should be removed in the next release. For now, it converts
       // invalid acronyms to HOST. When removed, only the 'else' part should
       // remain.
@@ -210,7 +205,7 @@
 
   public final void end() {
     // set final offset
-    int finalOffset = input.correctOffset(scanner.yychar() + scanner.yylength());
+    int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
     offsetAtt.setOffset(finalOffset, finalOffset);
   }
 
@@ -237,7 +232,7 @@
   }
 
   public void reset(Reader reader) throws IOException {
-    setInput(reader);
+    super.reset(reader);
     reset();
   }
Index: src/java/org/apache/lucene/analysis/Tokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/Tokenizer.java	(revision 813395)
+++ src/java/org/apache/lucene/analysis/Tokenizer.java	(working copy)
@@ -40,7 +40,7 @@
 public abstract class Tokenizer extends TokenStream {
   /** The text source for this Tokenizer. */
-  protected CharStream input;
+  protected Reader input;
 
   /** Construct a tokenizer with null input. */
   protected Tokenizer() {}
 
@@ -49,11 +49,6 @@
   protected Tokenizer(Reader input) {
     this.input = CharReader.get(input);
   }
-
-  /** Construct a token stream processing the given input. */
-  protected Tokenizer(CharStream input) {
-    this.input = input;
-  }
 
   /** Construct a tokenizer with null input using the given AttributeFactory. */
   protected Tokenizer(AttributeFactory factory) {
@@ -65,12 +60,6 @@
     super(factory);
     this.input = CharReader.get(input);
   }
-
-  /** Construct a token stream processing the given input using the given AttributeFactory. */
-  protected Tokenizer(AttributeFactory factory, CharStream input) {
-    super(factory);
-    this.input = input;
-  }
 
   /** Construct a token stream processing the given input using the given AttributeSource. */
   protected Tokenizer(AttributeSource source) {
@@ -83,28 +72,25 @@
     this.input = CharReader.get(input);
   }
 
-  /** Construct a token stream processing the given input using the given AttributeSource. */
-  protected Tokenizer(AttributeSource source, CharStream input) {
-    super(source);
-    this.input = input;
-  }
-
   /** By default, closes the input Reader. */
   public void close() throws IOException {
     input.close();
   }
+
+  /** Returns the corrected offset. If {@link #input} is a {@link CharStream} subclass,
+   * this method calls {@link CharStream#correctOffset}; otherwise currentOff is
+   * returned unchanged.
+   * @param currentOff offset as seen in the output
+   * @return corrected offset based on the input
+   * @see CharStream#correctOffset
+   */
+  protected final int correctOffset(int currentOff) {
+    return (input instanceof CharStream) ? ((CharStream) input).correctOffset(currentOff) : currentOff;
+  }
 
   /** Expert: Reset the tokenizer to a new reader.  Typically, an
    * analyzer (in its reusableTokenStream method) will use
    * this to re-use a previously created tokenizer. */
   public void reset(Reader input) throws IOException {
-    this.input = CharReader.get(input);
-  }
-
-  /** Expert: Reset the tokenizer to a new CharStream.  Typically, an
-   * analyzer (in its reusableTokenStream method) will use
-   * this to re-use a previously created tokenizer. */
-  public void reset(CharStream input) throws IOException {
     this.input = input;
   }
 }
Index: src/test/org/apache/lucene/analysis/TestMappingCharFilter.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestMappingCharFilter.java	(revision 813395)
+++ src/test/org/apache/lucene/analysis/TestMappingCharFilter.java	(working copy)
@@ -41,7 +41,7 @@
   }
 
   public void testReaderReset() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "x" ) ) );
+    CharStream cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
     char[] buf = new char[10];
     int len = cs.read(buf, 0, 10);
     assertEquals( 1, len );
@@ -57,55 +57,55 @@
   }
 
   public void testNothingChange() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "x" ) ) );
+    CharStream cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
     assertTokenStreamContents(ts, new String[]{"x"}, new int[]{0}, new int[]{1});
   }
 
   public void test1to1() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "h" ) ) );
+    CharStream cs = new MappingCharFilter( normMap, new StringReader( "h" ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
     assertTokenStreamContents(ts, new String[]{"i"}, new int[]{0}, new int[]{1});
   }
 
   public void test1to2() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "j" ) ) );
+    CharStream cs = new MappingCharFilter( normMap, new StringReader( "j" ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
     assertTokenStreamContents(ts, new String[]{"jj"}, new int[]{0}, new int[]{1});
   }
 
   public void test1to3() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "k" ) ) );
+    CharStream cs = new MappingCharFilter( normMap, new StringReader( "k" ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
     assertTokenStreamContents(ts, new String[]{"kkk"}, new int[]{0}, new int[]{1});
   }
 
   public void test2to4() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "ll" ) ) );
+    CharStream cs = new MappingCharFilter( normMap, new StringReader( "ll" ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
     assertTokenStreamContents(ts, new String[]{"llll"}, new int[]{0}, new int[]{2});
  }
 
   public void test2to1() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "aa" ) ) );
+    CharStream cs = new MappingCharFilter( normMap, new StringReader( "aa" ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
     assertTokenStreamContents(ts, new String[]{"a"}, new int[]{0}, new int[]{2});
   }
 
   public void test3to1() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "bbb" ) ) );
+    CharStream cs = new MappingCharFilter( normMap, new StringReader( "bbb" ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
     assertTokenStreamContents(ts, new String[]{"b"}, new int[]{0}, new int[]{3});
   }
 
   public void test4to2() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "cccc" ) ) );
+    CharStream cs = new MappingCharFilter( normMap, new StringReader( "cccc" ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
     assertTokenStreamContents(ts, new String[]{"cc"}, new int[]{0}, new int[]{4});
   }
 
   public void test5to0() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "empty" ) ) );
+    CharStream cs = new MappingCharFilter( normMap, new StringReader( "empty" ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
     assertTokenStreamContents(ts, new String[0]);
   }
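----

Reviewer note, not part of the patch: after this change Tokenizer.input is a plain
java.io.Reader again, and tokenizer implementations call the new protected
correctOffset() helper instead of input.correctOffset(). The helper only delegates
to CharStream.correctOffset() when the input actually is a CharStream/CharFilter;
otherwise it returns the offset unchanged. A minimal sketch of a custom tokenizer
written against the patched API follows; the class name WholeInputTokenizer and the
fixed 256-char buffer are illustrative assumptions, not part of the patch:

    import java.io.IOException;
    import java.io.Reader;

    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    /** Illustrative tokenizer that returns the whole input as a single token. */
    public final class WholeInputTokenizer extends Tokenizer {

      private final TermAttribute termAtt;
      private final OffsetAttribute offsetAtt;
      private boolean done = false;

      public WholeInputTokenizer(Reader in) {
        super(in); // the base class stores the Reader (wrapped via CharReader.get)
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
        offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
      }

      public boolean incrementToken() throws IOException {
        if (done) return false;
        done = true;
        final char[] buffer = new char[256];
        final int length = input.read(buffer, 0, buffer.length);
        if (length <= 0) return false;
        termAtt.setTermBuffer(buffer, 0, length);
        // correctOffset() maps output offsets back to the original Reader when
        // the input is a CharStream subclass, and is a no-op otherwise.
        offsetAtt.setOffset(correctOffset(0), correctOffset(length));
        return true;
      }

      public void reset(Reader in) throws IOException {
        super.reset(in); // replaces this.input, as in the patched Tokenizer.reset
        done = false;
      }
    }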
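The other user-visible convenience is the new MappingCharFilter(NormalizeCharMap,
Reader) constructor, which removes the CharReader.get() wrapping that the removed
test lines had to do by hand. A hedged usage sketch follows; the "ll" -> "llll"
mapping mirrors test2to4 above, and NormalizeCharMap.add(String, String) is assumed
to be the existing LUCENE-1466 API:

    import java.io.StringReader;

    import org.apache.lucene.analysis.CharStream;
    import org.apache.lucene.analysis.MappingCharFilter;
    import org.apache.lucene.analysis.NormalizeCharMap;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class MappingCharFilterExample {
      public static void main(String[] args) throws Exception {
        NormalizeCharMap normMap = new NormalizeCharMap();
        normMap.add("ll", "llll"); // expand "ll" to "llll", as in test2to4

        // Before this patch the Reader had to be wrapped manually:
        //   new MappingCharFilter(normMap, CharReader.get(new StringReader("ll")))
        CharStream cs = new MappingCharFilter(normMap, new StringReader("ll"));

        // The tokenizer sees the mapped text "llll", while correctOffset()
        // maps token offsets back to positions in the original two-char input,
        // so the expected output is: llll [0,2]
        TokenStream ts = new WhitespaceTokenizer(cs);
        TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
        OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(termAtt.term()
              + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + "]");
        }
      }
    }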