Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizer.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizer.java (revision 1525238) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizer.java (working copy) @@ -25,7 +25,6 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; -import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Version; /** A grammar-based tokenizer constructed with JFlex @@ -134,30 +133,34 @@ clearAttributes(); skippedPositions = 0; - while(true) { - int tokenType = scanner.getNextToken(); - - if (tokenType == StandardTokenizerInterface.YYEOF) { - return false; + try { + while(true) { + int tokenType = scanner.getNextToken(); + + if (tokenType == StandardTokenizerInterface.YYEOF) { + return false; + } + + if (scanner.yylength() <= maxTokenLength) { + posIncrAtt.setPositionIncrement(skippedPositions+1); + scanner.getText(termAtt); + final int start = scanner.yychar(); + offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length())); + + if (tokenType == ClassicTokenizer.ACRONYM_DEP) { + typeAtt.setType(ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.HOST]); + termAtt.setLength(termAtt.length() - 1); // remove extra '.' + } else { + typeAtt.setType(ClassicTokenizer.TOKEN_TYPES[tokenType]); + } + return true; + } else + // When we skip a too-long term, we still increment the + // position increment + skippedPositions++; } - - if (scanner.yylength() <= maxTokenLength) { - posIncrAtt.setPositionIncrement(skippedPositions+1); - scanner.getText(termAtt); - final int start = scanner.yychar(); - offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length())); - - if (tokenType == ClassicTokenizer.ACRONYM_DEP) { - typeAtt.setType(ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.HOST]); - termAtt.setLength(termAtt.length() - 1); // remove extra '.' - } else { - typeAtt.setType(ClassicTokenizer.TOKEN_TYPES[tokenType]); - } - return true; - } else - // When we skip a too-long term, we still increment the - // position increment - skippedPositions++; + } catch (NullPointerException npe) { + throw new IllegalStateException(RESET_MISSING_MESSAGE); } } Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (revision 1525238) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (working copy) @@ -148,24 +148,28 @@ clearAttributes(); skippedPositions = 0; - while(true) { - int tokenType = scanner.getNextToken(); - - if (tokenType == StandardTokenizerInterface.YYEOF) { - return false; + try { + while(true) { + int tokenType = scanner.getNextToken(); + + if (tokenType == StandardTokenizerInterface.YYEOF) { + return false; + } + + if (scanner.yylength() <= maxTokenLength) { + posIncrAtt.setPositionIncrement(skippedPositions+1); + scanner.getText(termAtt); + final int start = scanner.yychar(); + offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length())); + typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]); + return true; + } else + // When we skip a too-long term, we still increment the + // position increment + skippedPositions++; } - - if (scanner.yylength() <= maxTokenLength) { - posIncrAtt.setPositionIncrement(skippedPositions+1); - scanner.getText(termAtt); - final int start = scanner.yychar(); - offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length())); - typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]); - return true; - } else - // When we skip a too-long term, we still increment the - // position increment - skippedPositions++; + } catch (NullPointerException npe) { + throw new IllegalStateException(RESET_MISSING_MESSAGE); } } Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java (revision 1525238) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java (working copy) @@ -127,24 +127,28 @@ clearAttributes(); skippedPositions = 0; - while(true) { - int tokenType = scanner.getNextToken(); - - if (tokenType == StandardTokenizerInterface.YYEOF) { - return false; + try { + while(true) { + int tokenType = scanner.getNextToken(); + + if (tokenType == StandardTokenizerInterface.YYEOF) { + return false; + } + + if (scanner.yylength() <= maxTokenLength) { + posIncrAtt.setPositionIncrement(skippedPositions+1); + scanner.getText(termAtt); + final int start = scanner.yychar(); + offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length())); + typeAtt.setType(TOKEN_TYPES[tokenType]); + return true; + } else + // When we skip a too-long term, we still increment the + // position increment + skippedPositions++; } - - if (scanner.yylength() <= maxTokenLength) { - posIncrAtt.setPositionIncrement(skippedPositions+1); - scanner.getText(termAtt); - final int start = scanner.yychar(); - offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length())); - typeAtt.setType(TOKEN_TYPES[tokenType]); - return true; - } else - // When we skip a too-long term, we still increment the - // position increment - skippedPositions++; + } catch (NullPointerException npe) { + throw new IllegalStateException(RESET_MISSING_MESSAGE); } } Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java (revision 1525238) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java (working copy) @@ -114,7 +114,15 @@ bufferIndex = 0; } // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone - final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength()); + final int c; + try { + c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength()); + } catch (ArrayIndexOutOfBoundsException aioobe) { + if (bufferIndex == -1) { + throw new IllegalStateException(RESET_MISSING_MESSAGE); + } + throw aioobe; + } final int charCount = Character.charCount(c); bufferIndex += charCount; @@ -138,7 +146,6 @@ assert start != -1; offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(end)); return true; - } @Override Index: lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java =================================================================== --- lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java (revision 1525238) +++ lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java (working copy) @@ -31,6 +31,10 @@ setting attributes. */ public abstract class Tokenizer extends TokenStream { + /** @lucene.internal */ + protected static final String RESET_MISSING_MESSAGE = "The TokenStream consumer did not call reset() before incrementToken(). " + + "Please see Javadocs of TokenStream class for more information about the correct consuming workflow."; + /** The text source for this Tokenizer. */ protected Reader input;