Index: modules/analysis/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java (revision 1177458) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java (working copy) @@ -64,7 +64,7 @@ public void testNoStopwords() throws Exception { // Note: an empty list of fields passed in - protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Collections.EMPTY_LIST, 1); + protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Collections.<String>emptyList(), 1); TokenStream protectedTokenStream = protectedAnalyzer.tokenStream("variedField", new StringReader("quick")); assertTokenStreamContents(protectedTokenStream, new String[]{"quick"}); Index: modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex (revision 1177458) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex (working copy) @@ -189,4 +189,4 @@ // WB3b. ÷ (Newline | CR | LF) // WB14. Any ÷ Any // -[^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ } +[^] { break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. 
*/ } Index: modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex (revision 1177458) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex (working copy) @@ -277,4 +277,4 @@ // WB3b. ÷ (Newline | CR | LF) // WB14. Any ÷ Any // -[^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ } +[^] { break;/* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ } Index: modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java (revision 1177458) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java (working copy) @@ -1,4 +1,4 @@ -/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/4/11 4:07 PM */ +/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/30/11 6:01 AM */ package org.apache.lucene.analysis.standard; @@ -1066,17 +1066,17 @@ { return SOUTH_EAST_ASIAN_TYPE; } case 10: break; + case 1: + { break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ + } + case 11: break; case 4: { return KATAKANA_TYPE; } - case 11: break; + case 12: break; case 6: { return IDEOGRAPHIC_TYPE; } - case 12: break; - case 1: - { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. 
*/ - } case 13: break; case 8: { return HANGUL_TYPE; Index: modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.java (revision 1177458) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.java (working copy) @@ -1,4 +1,4 @@ -/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/4/11 4:07 PM */ +/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/30/11 6:01 AM */ package org.apache.lucene.analysis.standard.std31; @@ -34,8 +34,8 @@ /** * This class is a scanner generated by * JFlex 1.5.0-SNAPSHOT - * on 8/4/11 4:07 PM from the specification file - * /home/rmuir/workspace/lucene-clean-trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex + * on 9/30/11 6:01 AM from the specification file + * /lucene/reopenifneeded/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex */ public final class StandardTokenizerImpl31 implements StandardTokenizerInterface { @@ -1047,17 +1047,17 @@ { return SOUTH_EAST_ASIAN_TYPE; } case 10: break; + case 1: + { break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ + } + case 11: break; case 4: { return KATAKANA_TYPE; } - case 11: break; + case 12: break; case 6: { return IDEOGRAPHIC_TYPE; } - case 12: break; - case 1: - { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. 
*/ - } case 13: break; case 8: { return HANGUL_TYPE; Index: modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.java (revision 1177458) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.java (working copy) @@ -1,4 +1,4 @@ -/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/4/11 7:33 PM */ +/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/30/11 6:02 AM */ package org.apache.lucene.analysis.standard.std31; @@ -34,8 +34,8 @@ /** * This class is a scanner generated by * JFlex 1.5.0-SNAPSHOT - * on 8/4/11 7:33 PM from the specification file - * /home/rmuir/workspace/lucene-clean-trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex + * on 9/30/11 6:02 AM from the specification file + * /lucene/reopenifneeded/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex */ public final class UAX29URLEmailTokenizerImpl31 implements StandardTokenizerInterface { @@ -3638,17 +3638,17 @@ { return IDEOGRAPHIC_TYPE; } case 16: break; - case 1: - { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ - } - case 17: break; case 8: { return HANGUL_TYPE; } - case 18: break; + case 17: break; case 3: { return NUMERIC_TYPE; } + case 18: break; + case 1: + { break;/* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. 
*/ + } case 19: break; case 7: { return HIRAGANA_TYPE; Index: modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex (revision 1177458) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex (working copy) @@ -181,4 +181,4 @@ // WB3b. ÷ (Newline | CR | LF) // WB14. Any ÷ Any // -[^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ } +[^] { break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ } Index: modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex (revision 1177458) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/std31/UAX29URLEmailTokenizerImpl31.jflex (working copy) @@ -266,4 +266,4 @@ // WB3b. ÷ (Newline | CR | LF) // WB14. Any ÷ Any // -[^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ } +[^] { break;/* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. 
*/ } Index: modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java (revision 1177458) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java (working copy) @@ -1,4 +1,4 @@ -/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/4/11 7:48 PM */ +/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/30/11 6:02 AM */ package org.apache.lucene.analysis.standard; @@ -3728,17 +3728,17 @@ { return IDEOGRAPHIC_TYPE; } case 16: break; - case 1: - { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ - } - case 17: break; case 8: { return HANGUL_TYPE; } - case 18: break; + case 17: break; case 3: { return NUMERIC_TYPE; } + case 18: break; + case 1: + { break;/* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ + } case 19: break; case 7: { return HIRAGANA_TYPE; Index: modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex (revision 1177458) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex (working copy) @@ -127,4 +127,4 @@ {ACRONYM_DEP} { return ACRONYM_DEP; } /** Ignore the rest */ -. | {WHITESPACE} { /* ignore */ } +. 
| {WHITESPACE} { break;/* ignore */ } Index: modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java (revision 1177458) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java (working copy) @@ -1,4 +1,4 @@ -/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 2/9/11 11:45 AM */ +/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/30/11 6:01 AM */ package org.apache.lucene.analysis.standard; @@ -33,8 +33,8 @@ /** * This class is a scanner generated by * JFlex 1.5.0-SNAPSHOT - * on 2/9/11 11:45 AM from the specification file - * C:/Users/rmuir/workspace/lucene-2911/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex + * on 9/30/11 6:01 AM from the specification file + * /lucene/reopenifneeded/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex */ class ClassicTokenizerImpl implements StandardTokenizerInterface { @@ -694,13 +694,13 @@ { return HOST; } case 13: break; - case 1: - { /* ignore */ - } - case 14: break; case 8: { return ACRONYM_DEP; } + case 14: break; + case 1: + { break;/* ignore */ + } case 15: break; case 5: { return NUM; Index: modules/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro (revision 1177458) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro (working copy) @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -// Generated using ICU4J 4.6.0.0 on Wednesday, February 9, 2011 4:45:11 PM UTC +// Generated using ICU4J 4.8.0.0 on Friday, September 30, 2011 10:01:57 AM UTC // by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros Index: modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java (revision 1177458) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java (working copy) @@ -1,4 +1,4 @@ -/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 2/9/11 11:45 AM */ +/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/30/11 6:02 AM */ package org.apache.lucene.analysis.wikipedia; @@ -25,8 +25,8 @@ /** * This class is a scanner generated by * JFlex 1.5.0-SNAPSHOT - * on 2/9/11 11:45 AM from the specification file - * C:/Users/rmuir/workspace/lucene-2911/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex + * on 9/30/11 6:02 AM from the specification file + * /lucene/reopenifneeded/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex */ class WikipediaTokenizerImpl { @@ -817,177 +817,177 @@ { positionInc = 1; return ACRONYM; } case 47: break; - case 8: - { /* ignore */ + case 24: + { numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);break; } case 48: break; - case 20: - { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE); - } - case 49: break; case 35: { positionInc = 1; return COMPANY; } - case 50: break; + case 49: break; case 4: - { numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE); + { numWikiTokensSeen = 0; positionInc = 1; currentTokType 
= EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);break; } + case 50: break; + case 18: + { break;/* ignore STRING */ + } case 51: break; - case 25: - { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE); + case 31: + { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);break; } case 52: break; - case 43: - { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE); + case 34: + { positionInc = 1; return NUM; } case 53: break; - case 22: - { numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;} + case 32: + { positionInc = 1; return APOSTROPHE; } case 54: break; - case 34: - { positionInc = 1; return NUM; + case 1: + { numWikiTokensSeen = 0; positionInc = 1; break; } case 55: break; - case 32: - { positionInc = 1; return APOSTROPHE; + case 21: + { yybegin(STRING); return currentTokType;/*pipe*/ } case 56: break; - case 23: - { numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE); + case 44: + { currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); break; } case 57: break; - case 21: - { yybegin(STRING); return currentTokType;/*pipe*/ + case 36: + { currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE);break; } case 58: break; case 2: { positionInc = 1; return ALPHANUM; } case 59: break; - case 29: - { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); - } - case 60: break; case 17: { yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType; } + case 60: break; + case 43: + { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);break; + } case 61: break; - case 44: - { currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); + case 41: + { numBalanced = 0;currentTokType = ALPHANUM; 
yybegin(YYINITIAL);break;/*end bold italics*/ } case 62: break; - case 26: - { yybegin(YYINITIAL); - } - case 63: break; case 3: { positionInc = 1; return CJ; } + case 63: break; + case 10: + { numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); break; + } case 64: break; - case 38: - { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/ + case 6: + { yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType; } case 65: break; - case 15: - { currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING); + case 27: + { numLinkToks = 0; yybegin(YYINITIAL); break; } case 66: break; - case 30: - { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/ + case 8: + { break;/* ignore */ } case 67: break; - case 6: - { yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType; + case 19: + { yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/ } case 68: break; - case 5: - { positionInc = 1; + case 42: + { positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType; } case 69: break; - case 19: - { yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/ + case 13: + { currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); break; } case 70: break; - case 42: - { positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType; + case 15: + { currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING);break; } case 71: break; - case 27: - { numLinkToks = 0; yybegin(YYINITIAL); + case 5: + { positionInc = 1; break; } case 72: break; - case 11: - { currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE); + case 30: + { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);break;/*end italics*/ } case 73: break; - case 13: - { currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); + case 22: + { numWikiTokensSeen = 
0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}break; } case 74: break; + case 38: + { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);break;/*end sub header*/ + } + case 75: break; case 14: { yybegin(STRING); numWikiTokensSeen++; return currentTokType; } - case 75: break; - case 45: - { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE); - } case 76: break; - case 28: - { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); + case 20: + { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);break; } case 77: break; - case 37: - { numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/ + case 26: + { yybegin(YYINITIAL);break; } case 78: break; - case 9: - { if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType; + case 23: + { numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);break; } case 79: break; - case 7: - { yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType; + case 37: + { numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);break;/*end bold*/ } case 80: break; - case 24: - { numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE); + case 9: + { if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType; } case 81: break; - case 40: - { positionInc = 1; return EMAIL; + case 7: + { yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType; } case 82: break; - case 1: - { numWikiTokensSeen = 0; positionInc = 1; + case 25: + { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; 
yybegin(DOUBLE_BRACE_STATE);break; } case 83: break; - case 18: - { /* ignore STRING */ + case 45: + { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE);break; } case 84: break; - case 36: - { currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE); + case 40: + { positionInc = 1; return EMAIL; } case 85: break; - case 33: - { positionInc = 1; return HOST; + case 29: + { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); break; } case 86: break; - case 31: - { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE); + case 11: + { currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE);break; } case 87: break; - case 41: - { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end bold italics*/ + case 33: + { positionInc = 1; return HOST; } case 88: break; + case 28: + { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); break; + } + case 89: break; case 12: { currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/ } - case 89: break; - case 10: - { numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); - } case 90: break; default: if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { Index: modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex (revision 1177458) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex (working copy) @@ -192,108 +192,108 @@ //First {ALPHANUM} is always the link, set positioninc to 1 for double bracket, but then inside the internal link state //set it to 0 for the next token, such that the link and the first token are in the same position, but then subsequent 
//tokens within the link are incremented - {DOUBLE_BRACKET} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);} - {DOUBLE_BRACKET_CAT} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);} - {EXTERNAL_LINK} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);} - {TWO_SINGLE_QUOTES} {numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}} - {DOUBLE_EQUALS} {numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);} - {DOUBLE_BRACE} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);} - {CITATION} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);} + {DOUBLE_BRACKET} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);break;} + {DOUBLE_BRACKET_CAT} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);break;} + {EXTERNAL_LINK} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);break;} + {TWO_SINGLE_QUOTES} {numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}break;} + {DOUBLE_EQUALS} {numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);break;} + {DOUBLE_BRACE} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);break;} + {CITATION} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);break;} //ignore - . | {WHITESPACE} |{INFOBOX} {numWikiTokensSeen = 0; positionInc = 1; } + . 
| {WHITESPACE} |{INFOBOX} {numWikiTokensSeen = 0; positionInc = 1; break;} } { //First {ALPHANUM} is always the link, set position to 0 for these //This is slightly different from EXTERNAL_LINK_STATE because that one has an explicit grammar for capturing the URL {ALPHANUM} {yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;} - {DOUBLE_BRACKET_CLOSE} {numLinkToks = 0; yybegin(YYINITIAL);} + {DOUBLE_BRACKET_CLOSE} {numLinkToks = 0; yybegin(YYINITIAL); break;} //ignore - . | {WHITESPACE} { positionInc = 1; } + . | {WHITESPACE} { positionInc = 1; break;} } { //increment the link token, but then don't increment the tokens after that which are still in the link ("http://"|"https://"){HOST}("/"?({ALPHANUM}|{P}|\?|"&"|"="|"#")*)* {positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;} {ALPHANUM} {if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;} - "]" {numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL);} - {WHITESPACE} { positionInc = 1; } + "]" {numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); break;} + {WHITESPACE} { positionInc = 1; break;} } { {ALPHANUM} {yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;} - {DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);} + {DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);break;} //ignore - . | {WHITESPACE} { positionInc = 1; } + . 
| {WHITESPACE} { positionInc = 1; break;} } //italics { - "'" {currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE);} - "'''" {currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE);} + "'" {currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE);break;} + "'''" {currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE);break;} {ALPHANUM} {currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/} //we can have links inside, let those override - {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);} - {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);} - {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);} + {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); break;} + {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); break;} + {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); break;} //ignore - . | {WHITESPACE} { /* ignore */ } + . 
| {WHITESPACE} { break;/* ignore */ } } //bold { {ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;} //we can have links inside, let those override - {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);} - {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);} - {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);} + {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); break;} + {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); break;} + {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); break;} //ignore - . | {WHITESPACE} { /* ignore */ } + . | {WHITESPACE} { break;/* ignore */ } } //bold italics { {ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;} //we can have links inside, let those override - {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);} - {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);} - {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);} + {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); break;} + {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); break;} + {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); break;} //ignore - . | {WHITESPACE} { /* ignore */ } + . 
| {WHITESPACE} { break;/* ignore */ } } { - "=" {currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING);} + "=" {currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING);break;} {ALPHANUM} {currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;} - {DOUBLE_EQUALS} {yybegin(YYINITIAL);} + {DOUBLE_EQUALS} {yybegin(YYINITIAL);break;} //ignore - . | {WHITESPACE} { /* ignore */ } + . | {WHITESPACE} { break;/* ignore */ } } { {ALPHANUM} {yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;} - {DOUBLE_BRACE_CLOSE} {yybegin(YYINITIAL);} - {CITATION_CLOSE} {yybegin(YYINITIAL);} + {DOUBLE_BRACE_CLOSE} {yybegin(YYINITIAL);break;} + {CITATION_CLOSE} {yybegin(YYINITIAL);break;} //ignore - . | {WHITESPACE} { /* ignore */ } + . | {WHITESPACE} { break;/* ignore */ } } { - "'''''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end bold italics*/} - "'''" {numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/} - "''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/} - "===" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/} + "'''''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);break;/*end bold italics*/} + "'''" {numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);break;/*end bold*/} + "''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);break;/*end italics*/} + "===" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);break;/*end sub header*/} {ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/} //we can have links inside, let those override - {DOUBLE_BRACKET} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);} - {DOUBLE_BRACKET_CAT} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE);} - {EXTERNAL_LINK} 
{numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);} + {DOUBLE_BRACKET} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);break;} + {DOUBLE_BRACKET_CAT} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE);break;} + {EXTERNAL_LINK} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);break;} {PIPE} {yybegin(STRING); return currentTokType;/*pipe*/} - .|{WHITESPACE} { /* ignore STRING */ } + .|{WHITESPACE} { break;/* ignore STRING */ } } @@ -315,7 +315,7 @@ //end wikipedia /** Ignore the rest */ -. | {WHITESPACE}|{TAGS} { /* ignore */ } +. | {WHITESPACE}|{TAGS} { break;/* ignore */ } //INTERNAL_LINK = "["{2}({ALPHANUM}+{WHITESPACE}*)+"]"{2} Index: modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java =================================================================== --- modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java (revision 1177458) +++ modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java (working copy) @@ -161,6 +161,7 @@ break; case GZIP: in = csFactory.createCompressorInputStream(CompressorStreamFactory.GZIP, in); + break; case PLAIN: break; // nothing to do default: