Index: CHANGES.txt
===================================================================
--- CHANGES.txt (revision 813395)
+++ CHANGES.txt (working copy)
@@ -350,10 +350,12 @@
a top level reader and docID.
(Shai Erera, Chris Hostetter, Martin Ruckli, Mark Miller via Mike McCandless)
- * LUCENE-1466: Changed Tokenizer.input to be a CharStream; added
- CharFilter and MappingCharFilter, which allows chaining & mapping
- of characters before tokenizers run. (Koji Sekiguchi via Mike
- McCandless)
+ * LUCENE-1466, LUCENE-1906: Added CharFilter and MappingCharFilter, which allow
+ chaining & mapping of characters before tokenizers run. CharStream (subclass of
+ Reader) is the base class for custom java.io.Readers that support offset
+ correction. Tokenizers got an additional method correctOffset() that is passed
+ down to the underlying CharStream if input is a subclass of CharStream/-Filter.
+ (Koji Sekiguchi via Mike McCandless, Uwe Schindler)
* LUCENE-1703: Add IndexWriter.waitForMerges. (Tim Smith via Mike
McCandless)
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (revision 813395)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (working copy)
@@ -285,7 +285,7 @@
if (length > 0) {
termAtt.setTermBuffer(buffer, 0, length);
- offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
+ offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
typeAtt.setType(TOKEN_TYPE_NAMES[tokenType]);
return true;
} else if (dataLen == -1) {
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java (revision 813395)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java (working copy)
@@ -104,7 +104,7 @@
//System.out.println(new String(buffer, 0,
//length));
termAtt.setTermBuffer(buffer, 0, length);
- offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
+ offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
return true;
}
else
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (revision 813395)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (working copy)
@@ -207,7 +207,7 @@
int start = side == Side.FRONT ? 0 : inLen - gramSize;
int end = start + gramSize;
termAtt.setTermBuffer(inStr, start, gramSize);
- offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(end));
+ offsetAtt.setOffset(correctOffset(start), correctOffset(end));
gramSize++;
return true;
}
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (revision 813395)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (working copy)
@@ -124,7 +124,7 @@
int oldPos = pos;
pos++;
termAtt.setTermBuffer(inStr, oldPos, gramSize);
- offsetAtt.setOffset(input.correctOffset(oldPos), input.correctOffset(oldPos+gramSize));
+ offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize));
return true;
}
Index: contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
===================================================================
--- contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (revision 813395)
+++ contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (working copy)
@@ -116,7 +116,7 @@
return false;
else {
termAtt.setTermBuffer(buffer.toString());
- offsetAtt.setOffset(input.correctOffset(tokenStart), input.correctOffset(tokenEnd));
+ offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenEnd));
typeAtt.setType("sentence");
return true;
}
Index: contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java
===================================================================
--- contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java (revision 813395)
+++ contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java (working copy)
@@ -202,7 +202,7 @@
return false;
termAtt.setTermBuffer(snippet, startTerm, lenTerm);
- offsetAtt.setOffset(startOffset, startOffset + lenTerm);
+ offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset + lenTerm));
return true;
}
Index: contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
===================================================================
--- contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java (revision 813395)
+++ contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java (working copy)
@@ -17,7 +17,6 @@
package org.apache.lucene.wikipedia.analysis;
-import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
@@ -127,10 +126,6 @@
private TermAttribute termAtt;
private FlagsAttribute flagsAtt;
- void setInput(Reader reader) {
- this.input = CharReader.get(reader);
- }
-
/**
* Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
* input to a newly created JFlex scanner.
@@ -267,7 +262,7 @@
//trim the buffer
String s = buffer.toString().trim();
termAtt.setTermBuffer(s.toCharArray(), 0, s.length());
- offsetAtt.setOffset(input.correctOffset(theStart), input.correctOffset(theStart + s.length()));
+ offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length()));
flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
@@ -305,7 +300,7 @@
//trim the buffer
String s = buffer.toString().trim();
termAtt.setTermBuffer(s.toCharArray(), 0, s.length());
- offsetAtt.setOffset(input.correctOffset(theStart), input.correctOffset(theStart + s.length()));
+ offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length()));
flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
@@ -318,7 +313,7 @@
private void setupToken() {
scanner.getText(termAtt);
final int start = scanner.yychar();
- offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start + termAtt.termLength()));
+ offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.termLength()));
}
/*
@@ -332,7 +327,7 @@
}
public void reset(Reader reader) throws IOException {
- setInput(reader);
+ super.reset(reader);
reset();
}
Index: src/java/org/apache/lucene/analysis/BaseCharFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/BaseCharFilter.java (revision 813395)
+++ src/java/org/apache/lucene/analysis/BaseCharFilter.java (working copy)
@@ -43,7 +43,7 @@
/** Retrieve the corrected offset. Note that this method
* is slow, if you correct positions far before the most
* recently added position, as it's a simple linear
- * searhc backwards through all offset corrections added
+ * search backwards through all offset corrections added
* by {@link #addOffCorrectMap}. */
protected int correct(int currentOff) {
if (pcmList == null || pcmList.isEmpty()) {
Index: src/java/org/apache/lucene/analysis/CharFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/CharFilter.java (revision 813395)
+++ src/java/org/apache/lucene/analysis/CharFilter.java (working copy)
@@ -17,6 +17,7 @@
package org.apache.lucene.analysis;
+import java.io.Reader;
import java.io.IOException;
/**
@@ -33,6 +34,10 @@
input = in;
}
+ protected CharFilter(Reader in) {
+ input = CharReader.get(in);
+ }
+
/**
* Subclass may want to override to correct the current offset.
*
Index: src/java/org/apache/lucene/analysis/CharTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/CharTokenizer.java (revision 813395)
+++ src/java/org/apache/lucene/analysis/CharTokenizer.java (working copy)
@@ -104,13 +104,13 @@
}
termAtt.setTermLength(length);
- offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
+ offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
return true;
}
public final void end() {
// set final offset
- int finalOffset = input.correctOffset(offset);
+ int finalOffset = correctOffset(offset);
offsetAtt.setOffset(finalOffset, finalOffset);
}
Index: src/java/org/apache/lucene/analysis/KeywordTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/KeywordTokenizer.java (revision 813395)
+++ src/java/org/apache/lucene/analysis/KeywordTokenizer.java (working copy)
@@ -76,8 +76,8 @@
buffer = termAtt.resizeTermBuffer(1+buffer.length);
}
termAtt.setTermLength(upto);
- finalOffset = input.correctOffset(upto);
- offsetAtt.setOffset(input.correctOffset(0), finalOffset);
+ finalOffset = correctOffset(upto);
+ offsetAtt.setOffset(correctOffset(0), finalOffset);
return true;
}
return false;
Index: src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (revision 813395)
+++ src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (working copy)
@@ -20,7 +20,6 @@
import java.io.IOException;
import java.io.Reader;
-import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -92,10 +91,6 @@
*/
private boolean replaceInvalidAcronym;
- void setInput(Reader reader) {
- input = CharReader.get(reader);
- }
-
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
/** Set the max allowed token length. Any token longer
@@ -152,7 +147,7 @@
private void init(Reader input, boolean replaceInvalidAcronym) {
this.replaceInvalidAcronym = replaceInvalidAcronym;
- setInput(input);
+ this.input = input;
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
@@ -186,7 +181,7 @@
posIncrAtt.setPositionIncrement(posIncr);
scanner.getText(termAtt);
final int start = scanner.yychar();
- offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+termAtt.termLength()));
+ offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.termLength()));
// This 'if' should be removed in the next release. For now, it converts
// invalid acronyms to HOST. When removed, only the 'else' part should
// remain.
@@ -210,7 +205,7 @@
public final void end() {
// set final offset
- int finalOffset = input.correctOffset(scanner.yychar() + scanner.yylength());
+ int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
offsetAtt.setOffset(finalOffset, finalOffset);
}
@@ -237,7 +232,7 @@
}
public void reset(Reader reader) throws IOException {
- setInput(reader);
+ super.reset(reader);
reset();
}
Index: src/java/org/apache/lucene/analysis/Tokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/Tokenizer.java (revision 813395)
+++ src/java/org/apache/lucene/analysis/Tokenizer.java (working copy)
@@ -40,7 +40,7 @@
public abstract class Tokenizer extends TokenStream {
/** The text source for this Tokenizer. */
- protected CharStream input;
+ protected Reader input;
/** Construct a tokenizer with null input. */
protected Tokenizer() {}
@@ -49,11 +49,6 @@
protected Tokenizer(Reader input) {
this.input = CharReader.get(input);
}
-
- /** Construct a token stream processing the given input. */
- protected Tokenizer(CharStream input) {
- this.input = input;
- }
/** Construct a tokenizer with null input using the given AttributeFactory. */
protected Tokenizer(AttributeFactory factory) {
@@ -65,12 +60,6 @@
super(factory);
this.input = CharReader.get(input);
}
-
- /** Construct a token stream processing the given input using the given AttributeFactory. */
- protected Tokenizer(AttributeFactory factory, CharStream input) {
- super(factory);
- this.input = input;
- }
/** Construct a token stream processing the given input using the given AttributeSource. */
protected Tokenizer(AttributeSource source) {
@@ -83,28 +72,25 @@
this.input = CharReader.get(input);
}
- /** Construct a token stream processing the given input using the given AttributeSource. */
- protected Tokenizer(AttributeSource source, CharStream input) {
- super(source);
- this.input = input;
- }
-
/** By default, closes the input Reader. */
public void close() throws IOException {
input.close();
}
+
+ /** Return the corrected offset. If {@link #input} is a {@link CharStream} subclass
+ * this method calls {@link CharStream#correctOffset}, else returns currentOff.
+ * @param currentOff offset as seen in the output
+ * @return corrected offset based on the input
+ * @see CharStream#correctOffset
+ */
+ protected final int correctOffset(int currentOff) {
+ return (input instanceof CharStream) ? ((CharStream) input).correctOffset(currentOff) : currentOff;
+ }
/** Expert: Reset the tokenizer to a new reader. Typically, an
* analyzer (in its reusableTokenStream method) will use
* this to re-use a previously created tokenizer. */
public void reset(Reader input) throws IOException {
- this.input = CharReader.get(input);
- }
-
- /** Expert: Reset the tokenizer to a new CharStream. Typically, an
- * analyzer (in its reusableTokenStream method) will use
- * this to re-use a previously created tokenizer. */
- public void reset(CharStream input) throws IOException {
this.input = input;
}
}