Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java (revision 1525238) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java (working copy) @@ -88,6 +88,7 @@ @Override public void reset() throws IOException { + super.reset(); this.done = false; } } Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java (revision 1525238) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java (working copy) @@ -140,7 +140,8 @@ } @Override - public void end() { + public void end() throws IOException { + super.end(); // set final offset final int finalOffset = correctOffset(charsRead); this.offsetAtt.setOffset(finalOffset, finalOffset); Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java (revision 1525238) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java (working copy) @@ -138,6 +138,7 @@ @Override public void reset() throws IOException { + super.reset(); fillBuffer(str, input); matcher.reset(str); index = 0; Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizer.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizer.java (revision 1525238) +++ 
lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizer.java (working copy) @@ -114,7 +114,7 @@ } private void init(Version matchVersion) { - this.scanner = new ClassicTokenizerImpl(null); // best effort NPE if you dont call reset + this.scanner = new ClassicTokenizerImpl(input); } // this tokenizer generates three attributes: @@ -170,9 +170,16 @@ // adjust any skipped tokens posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+skippedPositions); } + + @Override + public void close() throws IOException { + super.close(); + scanner.yyreset(input); + } @Override public void reset() throws IOException { + super.reset(); scanner.yyreset(input); skippedPositions = 0; } Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (revision 1525238) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (working copy) @@ -128,7 +128,7 @@ } private final void init(Version matchVersion) { - this.scanner = new StandardTokenizerImpl(null); // best effort NPE if you dont call reset + this.scanner = new StandardTokenizerImpl(input); } // this tokenizer generates three attributes: @@ -180,7 +180,14 @@ } @Override + public void close() throws IOException { + super.close(); + scanner.yyreset(input); + } + + @Override public void reset() throws IOException { + super.reset(); scanner.yyreset(input); skippedPositions = 0; } Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java (revision 1525238) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java 
(working copy) @@ -111,8 +111,8 @@ this.scanner = getScannerFor(matchVersion); } - private static StandardTokenizerInterface getScannerFor(Version matchVersion) { - return new UAX29URLEmailTokenizerImpl(null); // best effort NPE if you dont call reset + private StandardTokenizerInterface getScannerFor(Version matchVersion) { + return new UAX29URLEmailTokenizerImpl(input); } // this tokenizer generates three attributes: @@ -157,9 +157,16 @@ // adjust any skipped tokens posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+skippedPositions); } + + @Override + public void close() throws IOException { + super.close(); + scanner.yyreset(input); + } @Override public void reset() throws IOException { + super.reset(); scanner.yyreset(input); skippedPositions = 0; } Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java (revision 1525238) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java (working copy) @@ -62,8 +62,7 @@ charUtils = CharacterUtils.getInstance(matchVersion); } - // note: bufferIndex is -1 here to best-effort AIOOBE consumers that don't call reset() - private int offset = 0, bufferIndex = -1, dataLen = 0, finalOffset = 0; + private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0; private static final int MAX_WORD_LEN = 255; private static final int IO_BUFFER_SIZE = 4096; @@ -150,6 +149,7 @@ @Override public void reset() throws IOException { + super.reset(); bufferIndex = 0; offset = 0; dataLen = 0; Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java (revision 1525238) +++ 
lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java (working copy) @@ -143,7 +143,7 @@ */ public WikipediaTokenizer(Reader input, int tokenOutput, Set untokenizedTypes) { super(input); - this.scanner = new WikipediaTokenizerImpl(null); // best effort NPE if you dont call reset + this.scanner = new WikipediaTokenizerImpl(input); init(tokenOutput, untokenizedTypes); } @@ -295,6 +295,12 @@ offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length())); } + @Override + public void close() throws IOException { + super.close(); + scanner.yyreset(input); + } + /* * (non-Javadoc) * @@ -302,6 +308,7 @@ */ @Override public void reset() throws IOException { + super.reset(); scanner.yyreset(input); tokens = null; scanner.reset(); Index: lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java =================================================================== --- lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java (revision 1525238) +++ lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java (working copy) @@ -45,8 +45,7 @@ /** true length of text in the buffer */ private int length = 0; /** length in buffer that can be evaluated safely, up to a safe end point */ - // note: usableLength is -1 here to best-effort AIOOBE consumers that don't call reset() - private int usableLength = -1; + private int usableLength = 0; /** accumulated offset of previous buffers for this reader, for offsetAtt */ private int offset = 0; Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java =================================================================== --- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java (revision 1525238) +++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java (working copy) @@ -243,7 +243,7 @@ 
outputCompounds = false; break; } - buffer.reset(null); // best effort NPE consumers that don't call reset() + buffer.reset(input); // input is the ILLEGAL_STATE_READER until reset(): consumers that skip reset() get IllegalStateException instead of NPE resetState(); @@ -261,7 +261,14 @@ } @Override + public void close() throws IOException { + super.close(); + buffer.reset(input); + } + + @Override public void reset() throws IOException { + super.reset(); buffer.reset(input); resetState(); } Index: lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java =================================================================== --- lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (revision 1525238) +++ lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (working copy) @@ -108,6 +108,7 @@ @Override public void reset() throws IOException { + super.reset(); tokenStart = tokenEnd = 0; } Index: lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java =================================================================== --- lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java (revision 1525238) +++ lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java (working copy) @@ -89,6 +89,7 @@ @Override public void reset() throws IOException { + super.reset(); iterator = null; } } Index: lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java =================================================================== --- lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java (revision 1525238) +++ lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java (working copy) @@ -30,21 +30,28 @@ call {@link AttributeSource#clearAttributes()} before setting attributes. */ -public abstract class Tokenizer extends TokenStream { +public abstract class Tokenizer extends TokenStream { /** The text source for this Tokenizer.
*/ - protected Reader input; + protected Reader input = ILLEGAL_STATE_READER; + + /** Pending reader: not actually assigned to input until reset() */ + private Reader inputPending = ILLEGAL_STATE_READER; /** Construct a token stream processing the given input. */ protected Tokenizer(Reader input) { - assert input != null: "input must not be null"; - this.input = input; + if (input == null) { + throw new NullPointerException("input must not be null"); + } + this.inputPending = input; } /** Construct a token stream processing the given input using the given AttributeFactory. */ protected Tokenizer(AttributeFactory factory, Reader input) { super(factory); - assert input != null: "input must not be null"; - this.input = input; + if (input == null) { + throw new NullPointerException("input must not be null"); + } + this.inputPending = input; } /** @@ -56,12 +63,10 @@ */ @Override public void close() throws IOException { - if (input != null) { - input.close(); - // LUCENE-2387: don't hold onto Reader after close, so - // GC can reclaim - input = null; - } + input.close(); + // LUCENE-2387: don't hold onto Reader after close, so + // GC can reclaim + inputPending = input = ILLEGAL_STATE_READER; } /** Return the corrected offset. If {@link #input} is a {@link CharFilter} subclass @@ -71,7 +76,6 @@ * @see CharFilter#correctOffset */ protected final int correctOffset(int currentOff) { - assert input != null: "this tokenizer is closed"; return (input instanceof CharFilter) ? ((CharFilter) input).correctOffset(currentOff) : currentOff; } @@ -79,14 +83,36 @@ * analyzer (in its tokenStream method) will use * this to re-use a previously created tokenizer. 
*/ public final void setReader(Reader input) throws IOException { - assert input != null: "input must not be null"; - this.input = input; + if (input == null) { + throw new NullPointerException("input must not be null"); + } + this.input = ILLEGAL_STATE_READER; + this.inputPending = input; assert setReaderTestPoint(); } + @Override + public void reset() throws IOException { + super.reset(); + input = inputPending; + inputPending = ILLEGAL_STATE_READER; + } + // only used by assert, for testing boolean setReaderTestPoint() { return true; } + + private static final Reader ILLEGAL_STATE_READER = new Reader() { + @Override + public int read(char[] cbuf, int off, int len) { + throw new IllegalStateException("TokenStream contract violation: reset()/close() call missing, " + + "reset() called multiple times, or subclass does not call super.reset(). " + + "Please see Javadocs of TokenStream class for more information about the correct consuming workflow."); + } + + @Override + public void close() {} + }; } Index: lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java =================================================================== --- lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java (revision 1525238) +++ lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java (working copy) @@ -68,7 +68,8 @@ } @Override - public void reset() { + public void reset() throws IOException { + super.reset(); tokens = null; upto = 0; } Index: lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java =================================================================== --- lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java (revision 1525238) +++ lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java (working copy) @@ -1599,14 +1599,15 @@ @Override public void reset() throws IOException { - this.upto = 0; - final StringBuilder b = new StringBuilder(); - final char[] buffer = new char[1024]; - int n; - 
while ((n = input.read(buffer)) != -1) { - b.append(buffer, 0, n); - } - this.tokens = b.toString().split(" "); + super.reset(); + this.upto = 0; + final StringBuilder b = new StringBuilder(); + final char[] buffer = new char[1024]; + int n; + while ((n = input.read(buffer)) != -1) { + b.append(buffer, 0, n); + } + this.tokens = b.toString().split(" "); } } Index: lucene/core/src/test/org/apache/lucene/search/TestTermRangeQuery.java =================================================================== --- lucene/core/src/test/org/apache/lucene/search/TestTermRangeQuery.java (revision 1525238) +++ lucene/core/src/test/org/apache/lucene/search/TestTermRangeQuery.java (working copy) @@ -227,7 +227,8 @@ } @Override - public void reset() throws IOException {; + public void reset() throws IOException { + super.reset(); done = false; } } Index: lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java =================================================================== --- lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java (revision 1525238) +++ lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java (working copy) @@ -319,7 +319,8 @@ } @Override - public void reset() { + public void reset() throws IOException { + super.reset(); startTerm = 0; nextStartOffset = 0; snippet = null; Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestMultiPhraseQueryParsing.java =================================================================== --- lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestMultiPhraseQueryParsing.java (revision 1525238) +++ lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestMultiPhraseQueryParsing.java (working copy) @@ -82,6 +82,7 @@ @Override public void reset() throws IOException { + super.reset(); this.upto = 0; this.lastPos = 0; } Index: 
lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java =================================================================== --- lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (revision 1525238) +++ lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (working copy) @@ -341,14 +341,17 @@ } public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException { + checkResetException(a, input); assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, null, input.length()); } public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException { + checkResetException(a, input); assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length()); } public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean offsetsAreCorrect) throws IOException { + checkResetException(a, input); assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), offsetsAreCorrect); } @@ -378,6 +381,7 @@ public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException { + checkResetException(a, input); assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, null, input.length()); } @@ -400,6 +404,27 @@ public static void assertAnalyzesToReuse(Analyzer a, String input, String[] 
output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException { assertAnalyzesToReuse(a, input, output, startOffsets, endOffsets, null, posIncrements); } + + static void checkResetException(Analyzer a, String input) throws IOException { + TokenStream ts = a.tokenStream("bogus", input); + try { + ts.incrementToken(); + fail("didn't get expected exception when reset() not called"); + } catch (IllegalStateException expected) { + // ok + } catch (AssertionError expected) { + // ok: MockTokenizer + assertTrue(expected.getMessage(), expected.getMessage().contains("wrong state")); + } catch (Exception unexpected) { + fail("got wrong exception when reset() not called: " + unexpected); + } finally { + // consume correctly + ts.reset(); + while (ts.incrementToken()) {} + ts.end(); + ts.close(); + } + } // simple utility method for testing stemmers Index: solr/core/src/java/org/apache/solr/analysis/TrieTokenizerFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/TrieTokenizerFactory.java (revision 1525238) +++ solr/core/src/java/org/apache/solr/analysis/TrieTokenizerFactory.java (working copy) @@ -96,8 +96,9 @@ } @Override - public void reset() { - try { + public void reset() throws IOException { + super.reset(); + try { int upto = 0; char[] buf = termAtt.buffer(); while (true) { @@ -167,6 +168,7 @@ @Override public void end() throws IOException { + super.end(); if (hasValue) { ts.end(); } Index: solr/core/src/java/org/apache/solr/schema/BoolField.java =================================================================== --- solr/core/src/java/org/apache/solr/schema/BoolField.java (revision 1525238) +++ solr/core/src/java/org/apache/solr/schema/BoolField.java (working copy) @@ -74,6 +74,7 @@ @Override public void reset() throws IOException { + super.reset(); done = false; } Index: solr/core/src/java/org/apache/solr/schema/PreAnalyzedField.java 
=================================================================== --- solr/core/src/java/org/apache/solr/schema/PreAnalyzedField.java (revision 1525238) +++ solr/core/src/java/org/apache/solr/schema/PreAnalyzedField.java (working copy) @@ -288,6 +288,7 @@ @Override public final void reset() throws IOException { + super.reset(); // NOTE: this acts like rewind if you call it again if (input != lastReader) { lastReader = input;