Index: CHANGES.txt =================================================================== --- CHANGES.txt (revision 813395) +++ CHANGES.txt (working copy) @@ -350,10 +350,12 @@ a top level reader and docID. (Shai Erera, Chris Hostetter, Martin Ruckli, Mark Miller via Mike McCandless) - * LUCENE-1466: Changed Tokenizer.input to be a CharStream; added - CharFilter and MappingCharFilter, which allows chaining & mapping - of characters before tokenizers run. (Koji Sekiguchi via Mike - McCandless) + * LUCENE-1466, LUCENE-1906: Added CharFilter and MappingCharFilter, which allow + chaining & mapping of characters before tokenizers run. CharStream (subclass of + Reader) is the base class for custom java.io.Readers that support offset + correction. Tokenizers got an additional method correctOffset() that is passed + down to the underlying CharStream if input is a subclass of CharStream/-Filter. + (Koji Sekiguchi via Mike McCandless, Uwe Schindler) * LUCENE-1703: Add IndexWriter.waitForMerges. (Tim Smith via Mike McCandless) Index: src/java/org/apache/lucene/analysis/BaseCharFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/BaseCharFilter.java (revision 813395) +++ src/java/org/apache/lucene/analysis/BaseCharFilter.java (working copy) @@ -43,7 +43,7 @@ /** Retrieve the corrected offset. Note that this method * is slow, if you correct positions far before the most * recently added position, as it's a simple linear - * searhc backwards through all offset corrections added + * search backwards through all offset corrections added * by {@link #addOffCorrectMap}. 
*/ protected int correct(int currentOff) { if (pcmList == null || pcmList.isEmpty()) { Index: src/java/org/apache/lucene/analysis/CharFilter.java =================================================================== --- src/java/org/apache/lucene/analysis/CharFilter.java (revision 813395) +++ src/java/org/apache/lucene/analysis/CharFilter.java (working copy) @@ -17,6 +17,7 @@ package org.apache.lucene.analysis; +import java.io.Reader; import java.io.IOException; /** @@ -33,6 +34,10 @@ input = in; } + protected CharFilter(Reader in) { + input = CharReader.get(in); + } + /** * Subclass may want to override to correct the current offset. * Index: src/java/org/apache/lucene/analysis/CharTokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/CharTokenizer.java (revision 813395) +++ src/java/org/apache/lucene/analysis/CharTokenizer.java (working copy) @@ -104,13 +104,13 @@ } termAtt.setTermLength(length); - offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length)); + offsetAtt.setOffset(correctOffset(start), correctOffset(start+length)); return true; } public final void end() { // set final offset - int finalOffset = input.correctOffset(offset); + int finalOffset = correctOffset(offset); offsetAtt.setOffset(finalOffset, finalOffset); } Index: src/java/org/apache/lucene/analysis/KeywordTokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/KeywordTokenizer.java (revision 813395) +++ src/java/org/apache/lucene/analysis/KeywordTokenizer.java (working copy) @@ -76,8 +76,8 @@ buffer = termAtt.resizeTermBuffer(1+buffer.length); } termAtt.setTermLength(upto); - finalOffset = input.correctOffset(upto); - offsetAtt.setOffset(input.correctOffset(0), finalOffset); + finalOffset = correctOffset(upto); + offsetAtt.setOffset(correctOffset(0), finalOffset); return true; } return false; Index: 
src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (revision 813395) +++ src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (working copy) @@ -186,7 +186,7 @@ posIncrAtt.setPositionIncrement(posIncr); scanner.getText(termAtt); final int start = scanner.yychar(); - offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+termAtt.termLength())); + offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.termLength())); // This 'if' should be removed in the next release. For now, it converts // invalid acronyms to HOST. When removed, only the 'else' part should // remain. @@ -210,7 +210,7 @@ public final void end() { // set final offset - int finalOffset = input.correctOffset(scanner.yychar() + scanner.yylength()); + int finalOffset = correctOffset(scanner.yychar() + scanner.yylength()); offsetAtt.setOffset(finalOffset, finalOffset); } Index: src/java/org/apache/lucene/analysis/Tokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/Tokenizer.java (revision 813395) +++ src/java/org/apache/lucene/analysis/Tokenizer.java (working copy) @@ -40,7 +40,7 @@ public abstract class Tokenizer extends TokenStream { /** The text source for this Tokenizer. */ - protected CharStream input; + protected Reader input; /** Construct a tokenizer with null input. */ protected Tokenizer() {} @@ -49,11 +49,6 @@ protected Tokenizer(Reader input) { this.input = CharReader.get(input); } - - /** Construct a token stream processing the given input. */ - protected Tokenizer(CharStream input) { - this.input = input; - } /** Construct a tokenizer with null input using the given AttributeFactory. 
*/ protected Tokenizer(AttributeFactory factory) { @@ -65,12 +60,6 @@ super(factory); this.input = CharReader.get(input); } - - /** Construct a token stream processing the given input using the given AttributeFactory. */ - protected Tokenizer(AttributeFactory factory, CharStream input) { - super(factory); - this.input = input; - } /** Construct a token stream processing the given input using the given AttributeSource. */ protected Tokenizer(AttributeSource source) { @@ -83,28 +72,25 @@ this.input = CharReader.get(input); } - /** Construct a token stream processing the given input using the given AttributeSource. */ - protected Tokenizer(AttributeSource source, CharStream input) { - super(source); - this.input = input; - } - /** By default, closes the input Reader. */ public void close() throws IOException { input.close(); } + + /** Return the corrected offset. If {@link #input} is a {@link CharStream} subclass + * this method calls {@link CharStream#correctOffset}, else returns currentOff. + * @param currentOff offset as seen in the output + * @return corrected offset based on the input + * @see CharStream#correctOffset + */ + protected final int correctOffset(int currentOff) { + return (input instanceof CharStream) ? ((CharStream) input).correctOffset(currentOff) : currentOff; + } /** Expert: Reset the tokenizer to a new reader. Typically, an * analyzer (in its reusableTokenStream method) will use * this to re-use a previously created tokenizer. */ public void reset(Reader input) throws IOException { - this.input = CharReader.get(input); - } - - /** Expert: Reset the tokenizer to a new CharStream. Typically, an - * analyzer (in its reusableTokenStream method) will use - * this to re-use a previously created tokenizer. */ - public void reset(CharStream input) throws IOException { this.input = input; } }