Index: lucene/contrib/analyzers/common/build.xml =================================================================== --- lucene/contrib/analyzers/common/build.xml (revision 945106) +++ lucene/contrib/analyzers/common/build.xml (working copy) @@ -36,11 +36,13 @@ - + + + - - + + JFlex 1.4.1 - * on 4/15/08 4:31 AM from the specification file - * /mnt2/mike/src/lucene.clean/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex + * JFlex 1.5.0-SNAPSHOT + * on 17.05.10 14:43 from the specification file + * C:/Users/Uwe Schindler/Projects/lucene/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex */ class WikipediaTokenizerImpl { @@ -37,17 +37,28 @@ private static final int ZZ_BUFFERSIZE = 16384; /** lexical states */ - public static final int DOUBLE_BRACE_STATE = 8; - public static final int INTERNAL_LINK_STATE = 2; - public static final int TWO_SINGLE_QUOTES_STATE = 4; - public static final int CATEGORY_STATE = 1; - public static final int FIVE_SINGLE_QUOTES_STATE = 6; - public static final int STRING = 9; + public static final int CATEGORY_STATE = 2; + public static final int DOUBLE_EQUALS_STATE = 14; + public static final int EXTERNAL_LINK_STATE = 6; + public static final int INTERNAL_LINK_STATE = 4; + public static final int DOUBLE_BRACE_STATE = 16; + public static final int FIVE_SINGLE_QUOTES_STATE = 12; + public static final int STRING = 18; + public static final int TWO_SINGLE_QUOTES_STATE = 8; public static final int YYINITIAL = 0; - public static final int DOUBLE_EQUALS_STATE = 7; - public static final int THREE_SINGLE_QUOTES_STATE = 5; - public static final int EXTERNAL_LINK_STATE = 3; + public static final int THREE_SINGLE_QUOTES_STATE = 10; + /** + * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l + * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l + * at the beginning of a line + * l is of the form l = 2*k, k a non negative integer + */ + private static final int ZZ_LEXSTATE[] = { + 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, + 8, 8, 9, 9 + }; + /** * Translates characters to character classes */ @@ -390,9 +401,6 @@ /** the textposition at the last accepting state */ private int zzMarkedPos; - /** the textposition at the last state to be included in yytext */ - private int zzPushbackPos; - /** the current text position in the buffer */ private int zzCurrentPos; @@ -423,6 +431,9 @@ /** zzAtEOF == true <=> the scanner is at the EOF */ private boolean zzAtEOF; + /** denotes if the user-EOF-code has already been executed */ + private boolean zzEOFDone; + /* user code: */ public static final int ALPHANUM = WikipediaTokenizer.ALPHANUM_ID; @@ -547,7 +558,6 @@ zzEndRead-= zzStartRead; zzCurrentPos-= zzStartRead; zzMarkedPos-= zzStartRead; - zzPushbackPos-= zzStartRead; zzStartRead = 0; } @@ -563,13 +573,23 @@ int numRead = zzReader.read(zzBuffer, zzEndRead, zzBuffer.length-zzEndRead); - if (numRead < 0) { - return true; - } - else { + if (numRead > 0) { zzEndRead+= numRead; return false; } + // unlikely but not impossible: read 0 characters, but not at end of stream + if (numRead == 0) { + int c = zzReader.read(); + if (c == -1) { + return true; + } else { + zzBuffer[zzEndRead++] = (char) c; + return false; + } + } + + // numRead < 0 + return true; } @@ -593,16 +613,21 @@ * cannot be reused (internal buffer is discarded and lost). * Lexical state is set to ZZ_INITIAL. * + * Internal scan buffer is resized down to its initial length, if it has grown. + * * @param reader the new input stream */ public final void yyreset(java.io.Reader reader) { zzReader = reader; zzAtBOL = true; zzAtEOF = false; + zzEOFDone = false; zzEndRead = zzStartRead = 0; - zzCurrentPos = zzMarkedPos = zzPushbackPos = 0; + zzCurrentPos = zzMarkedPos = 0; yyline = yychar = yycolumn = 0; zzLexicalState = YYINITIAL; + if (zzBuffer.length > ZZ_BUFFERSIZE) + zzBuffer = new char[ZZ_BUFFERSIZE]; } @@ -730,7 +755,7 @@ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; - zzState = zzLexicalState; + zzState = ZZ_LEXSTATE[zzLexicalState]; zzForAction: { @@ -778,184 +803,184 @@ zzMarkedPos = zzMarkedPosL; switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { - case 8: - { /* ignore */ + case 25: + { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE); } case 46: break; - case 28: - { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); + case 30: + { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/ } case 47: break; - case 3: - { positionInc = 1; return CJ; + case 41: + { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end bold italics*/ } case 48: break; - case 30: - { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/ + case 14: + { yybegin(STRING); numWikiTokensSeen++; return currentTokType; } case 49: break; - case 10: - { numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); + case 23: + { numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE); } case 50: break; - case 41: - { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end bold italics*/ + case 34: + { positionInc = 1; return NUM; } case 51: break; - case 7: - { yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType; + case 18: + { /* ignore STRING */ } case 52: break; - case 23: - { numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE); + case 12: + { currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/ } case 53: break; - case 38: - { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/ + case 37: + { numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/ } case 54: break; - case 17: - { yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType; + case 31: + { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE); } case 55: break; - case 24: - { numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE); + case 10: + { numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); } case 56: break; - case 14: - { yybegin(STRING); numWikiTokensSeen++; return currentTokType; + case 38: + { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/ } case 57: break; - case 5: - { positionInc = 1; + case 19: + { yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/ } case 58: break; - case 43: - { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE); + case 11: + { currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE); } case 59: break; - case 26: - { yybegin(YYINITIAL); + case 1: + { numWikiTokensSeen = 0; positionInc = 1; } case 60: break; - case 20: - { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE); + case 33: + { positionInc = 1; return HOST; } case 61: break; - case 1: - { numWikiTokensSeen = 0; positionInc = 1; + case 3: + { positionInc = 1; return CJ; } case 62: break; - case 40: - { positionInc = 1; return EMAIL; + case 17: + { yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType; } case 63: break; - case 25: - { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE); + case 32: + { positionInc = 1; return APOSTROPHE; } case 64: break; - case 39: - { positionInc = 1; return ACRONYM; + case 8: + { /* ignore */ } case 65: break; - case 9: - { if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType; + case 4: + { numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE); } case 66: break; - case 22: - { numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;} + case 2: + { positionInc = 1; return ALPHANUM; } case 67: break; - case 31: - { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE); + case 26: + { yybegin(YYINITIAL); } case 68: break; - case 15: - { currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING); + case 43: + { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE); } case 69: break; - case 18: - { /* ignore STRING */ + case 36: + { currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE); } case 70: break; - case 42: - { positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType; + case 13: + { currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); } case 71: break; - case 21: - { yybegin(STRING); return currentTokType;/*pipe*/ + case 24: + { numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE); } case 72: break; - case 37: - { numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/ + case 27: + { numLinkToks = 0; yybegin(YYINITIAL); } case 73: break; - case 33: - { positionInc = 1; return HOST; + case 15: + { currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING); } case 74: break; - case 45: - { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE); + case 28: + { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); } case 75: break; - case 36: - { currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE); + case 39: + { positionInc = 1; return ACRONYM; } case 76: break; - case 13: - { currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); + case 29: + { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); } case 77: break; + case 7: + { yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType; + } + case 78: break; case 16: { currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType; } - case 78: break; - case 12: - { currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/ - } case 79: break; - case 6: - { yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType; + case 20: + { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE); } case 80: break; - case 32: - { positionInc = 1; return APOSTROPHE; + case 35: + { positionInc = 1; return COMPANY; } case 81: break; - case 19: - { yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/ + case 40: + { positionInc = 1; return EMAIL; } case 82: break; - case 34: - { positionInc = 1; return NUM; + case 42: + { positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType; } case 83: break; + case 6: + { yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType; + } + case 84: break; case 44: { currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); } - case 84: break; - case 2: - { positionInc = 1; return ALPHANUM; - } case 85: break; - case 35: - { positionInc = 1; return COMPANY; + case 5: + { positionInc = 1; } case 86: break; - case 11: - { currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE); + case 9: + { if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType; } case 87: break; - case 29: - { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); + case 45: + { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE); } case 88: break; - case 4: - { numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE); + case 22: + { numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;} } case 89: break; - case 27: - { numLinkToks = 0; yybegin(YYINITIAL); + case 21: + { yybegin(STRING); return currentTokType;/*pipe*/ } case 90: break; default: Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex =================================================================== --- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex (revision 945106) +++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex (working copy) @@ -17,12 +17,12 @@ * limitations under the License. */ -import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; %% %class WikipediaTokenizerImpl -%unicode +%unicode 3.0 %integer %function getNextToken %pack @@ -81,7 +81,7 @@ /** * Fills Lucene token with the current token text. */ -final void getText(Token t) { +final void getText(TermAttribute t) { t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); } Index: lucene/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt =================================================================== --- lucene/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt (revision 945106) +++ lucene/src/java/org/apache/lucene/analysis/standard/READ_BEFORE_REGENERATING.txt (working copy) @@ -17,4 +17,5 @@ WARNING: if you change StandardTokenizerImpl*.jflex and need to regenerate - the tokenizer, only use the trunk version of JFlex 1.5 at the moment! + the tokenizer, only use the trunk version of JFlex 1.5 (with a minimum + SVN revision 591) at the moment! Index: lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java =================================================================== --- lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (revision 945106) +++ lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (working copy) @@ -201,7 +201,7 @@ @Override public void reset(Reader reader) throws IOException { super.reset(reader); - scanner.reset(reader); + scanner.yyreset(reader); } /** Index: lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl31.java =================================================================== --- lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl31.java (revision 945106) +++ lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl31.java (working copy) @@ -1,4 +1,4 @@ -/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10.04.10 13:07 */ +/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 17.05.10 14:35 */ package org.apache.lucene.analysis.standard; @@ -33,8 +33,8 @@ /** * This class is a scanner generated by * JFlex 1.5.0-SNAPSHOT - * on 10.04.10 13:07 from the specification file - * C:/Users/Uwe Schindler/Projects/lucene/trunk-full1/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl31.jflex + * on 17.05.10 14:35 from the specification file + * C:/Users/Uwe Schindler/Projects/lucene/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl31.jflex */ class StandardTokenizerImpl31 implements StandardTokenizerInterface { @@ -379,19 +379,8 @@ t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); } -/** - * Resets the Tokenizer to a new Reader. - */ -public final void reset(Reader r) { - // reset to default buffer size, if buffer has grown - if (zzBuffer.length > ZZ_BUFFERSIZE) { - zzBuffer = new char[ZZ_BUFFERSIZE]; - } - yyreset(r); -} - /** * Creates a new scanner * There is also a java.io.InputStream version of this constructor. @@ -505,6 +494,8 @@ * cannot be reused (internal buffer is discarded and lost). * Lexical state is set to ZZ_INITIAL. * + * Internal scan buffer is resized down to its initial length, if it has grown. + * * @param reader the new input stream */ public final void yyreset(java.io.Reader reader) { @@ -516,6 +507,8 @@ zzCurrentPos = zzMarkedPos = 0; yyline = yychar = yycolumn = 0; zzLexicalState = YYINITIAL; + if (zzBuffer.length > ZZ_BUFFERSIZE) + zzBuffer = new char[ZZ_BUFFERSIZE]; } Index: lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl31.jflex =================================================================== --- lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl31.jflex (revision 945106) +++ lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl31.jflex (working copy) @@ -67,17 +67,6 @@ t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); } -/** - * Resets the Tokenizer to a new Reader. - */ -public final void reset(Reader r) { - // reset to default buffer size, if buffer has grown - if (zzBuffer.length > ZZ_BUFFERSIZE) { - zzBuffer = new char[ZZ_BUFFERSIZE]; - } - yyreset(r); -} - %} THAI = [\u0E00-\u0E59] Index: lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImplOrig.java =================================================================== --- lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImplOrig.java (revision 945106) +++ lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImplOrig.java (working copy) @@ -1,4 +1,4 @@ -/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10.04.10 13:07 */ +/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 17.05.10 14:35 */ package org.apache.lucene.analysis.standard; @@ -33,8 +33,8 @@ /** * This class is a scanner generated by * JFlex 1.5.0-SNAPSHOT - * on 10.04.10 13:07 from the specification file - * C:/Users/Uwe Schindler/Projects/lucene/trunk-full1/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImplOrig.jflex + * on 17.05.10 14:35 from the specification file + * C:/Users/Uwe Schindler/Projects/lucene/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImplOrig.jflex */ class StandardTokenizerImplOrig implements StandardTokenizerInterface { @@ -375,19 +375,8 @@ t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); } -/** - * Resets the Tokenizer to a new Reader. - */ -public final void reset(Reader r) { - // reset to default buffer size, if buffer has grown - if (zzBuffer.length > ZZ_BUFFERSIZE) { - zzBuffer = new char[ZZ_BUFFERSIZE]; - } - yyreset(r); -} - /** * Creates a new scanner * There is also a java.io.InputStream version of this constructor. @@ -501,6 +490,8 @@ * cannot be reused (internal buffer is discarded and lost). * Lexical state is set to ZZ_INITIAL. * + * Internal scan buffer is resized down to its initial length, if it has grown. + * * @param reader the new input stream */ public final void yyreset(java.io.Reader reader) { @@ -512,6 +503,8 @@ zzCurrentPos = zzMarkedPos = 0; yyline = yychar = yycolumn = 0; zzLexicalState = YYINITIAL; + if (zzBuffer.length > ZZ_BUFFERSIZE) + zzBuffer = new char[ZZ_BUFFERSIZE]; } Index: lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImplOrig.jflex =================================================================== --- lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImplOrig.jflex (revision 945106) +++ lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImplOrig.jflex (working copy) @@ -67,17 +67,6 @@ t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); } -/** - * Resets the Tokenizer to a new Reader. - */ -public final void reset(Reader r) { - // reset to default buffer size, if buffer has grown - if (zzBuffer.length > ZZ_BUFFERSIZE) { - zzBuffer = new char[ZZ_BUFFERSIZE]; - } - yyreset(r); -} - %} THAI = [\u0E00-\u0E59] Index: lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerInterface.java =================================================================== --- lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerInterface.java (revision 945106) +++ lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerInterface.java (working copy) @@ -47,7 +47,7 @@ * * @param reader the new input stream */ - void reset(Reader reader); + void yyreset(Reader reader); /** * Returns the length of the matched text region.