Index: D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java =================================================================== --- D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java (revision 598379) +++ D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java (working copy) @@ -33,6 +33,12 @@ public class StandardAnalyzer extends Analyzer { private Set stopSet; + /** + * Specifies whether deprecated acronyms should be replaced with HOST type. + * @deprecated this should be removed in the next release (3.0). + */ + public boolean replaceDepAcronym = true; + /** An array containing some common English words that are usually not useful for searching. */ public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS; @@ -70,6 +76,7 @@ StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */ public TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new StandardTokenizer(reader); + ((StandardTokenizer) result).replaceDepAcronym = replaceDepAcronym; result = new StandardFilter(result); result = new LowerCaseFilter(result); result = new StopFilter(result, stopSet); @@ -79,7 +86,8 @@ private class SavedStreams { StandardTokenizer tokenStream; TokenStream filteredTokenStream; - }; + } + public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { SavedStreams streams = (SavedStreams) getPreviousTokenStream(); if (streams == null) { @@ -89,9 +97,12 @@ streams.filteredTokenStream = new StandardFilter(streams.tokenStream); streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream); streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet); - } else + } else { streams.tokenStream.reset(reader); + } + streams.tokenStream.replaceDepAcronym = replaceDepAcronym; + return streams.filteredTokenStream; } } Index: 
D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardTokenizer.java =================================================================== --- D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (revision 598379) +++ D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (working copy) @@ -43,6 +43,13 @@ public class StandardTokenizer extends Tokenizer { /** A private instance of the JFlex-constructed scanner */ private final StandardTokenizerImpl scanner; + + /** + * Specifies whether deprecated acronyms should be replaced with HOST type. + * @deprecated this should be removed in the next release (3.0). + */ + public boolean replaceDepAcronym = true; + void setInput(Reader reader) { this.input = reader; } @@ -72,7 +79,20 @@ final int start = scanner.yychar(); result.setStartOffset(start); result.setEndOffset(start+result.termLength()); - result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]); + // This 'if' should be removed in the next release. For now, it converts + // invalid acronyms to HOST. When removed, only the 'else' part should + // remain. 
+ if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) { + if (replaceDepAcronym) { + result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]); + // ACRONYM_DEP always ends with '.'; drop it so the HOST term is the bare host name. + result.setTermLength(result.termLength() - 1); + } else { + result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]); + } + } else { + result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]); + } return result; } Index: D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java =================================================================== --- D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java (revision 598379) +++ D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java (working copy) @@ -1,4 +1,4 @@ -/* The following code was generated by JFlex 1.4.1 on 8/9/07 10:15 AM */ +/* The following code was generated by JFlex 1.4.1 on 11/29/07 3:16 PM */ package org.apache.lucene.analysis.standard; @@ -25,8 +25,8 @@ /** * This class is a scanner generated by * JFlex 1.4.1 - * on 8/9/07 10:15 AM from the specification file - * /tango/mike/src/lucene.tokenfix/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex + * on 11/29/07 3:16 PM from the specification file + * D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex */ class StandardTokenizerImpl { @@ -63,12 +63,14 @@ private static final int [] ZZ_ACTION = zzUnpackAction(); private static final String ZZ_ACTION_PACKED_0 = - "\1\0\1\1\4\2\1\3\1\1\14\0\1\4\4\5"+ - "\2\6\2\0\1\7\1\0\1\7\3\5\6\7\3\5"+ - "\1\10\4\0\1\10\2\0\2\10\2\5\1\11"; + "\1\0\1\1\4\2\1\3\1\1\6\0\2\2\6\0"+ + "\1\4\4\5\2\6\2\0\1\7\1\0\1\7\2\0"+ + "\3\5\6\7\3\5\1\10\1\0\2\5\3\0\1\10"+ + "\2\0\2\5\2\10\1\5\2\11\1\5\1\12\2\11"+ + "\2\5"; private static int [] zzUnpackAction() { - int [] result = new int[57]; + int [] result = new int[71]; int offset = 0; offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); return result; @@ 
-96,14 +98,15 @@ "\0\0\0\17\0\36\0\55\0\74\0\113\0\17\0\132"+ "\0\151\0\170\0\207\0\226\0\245\0\264\0\303\0\322"+ "\0\341\0\360\0\377\0\u010e\0\u011d\0\u012c\0\u013b\0\u014a"+ - "\0\u0159\0\207\0\u0168\0\u0177\0\u0186\0\u0195\0\u01a4\0\u01b3"+ + "\0\u0159\0\u0168\0\u0177\0\207\0\u0186\0\u0195\0\u01a4\0\u01b3"+ "\0\u01c2\0\u01d1\0\u01e0\0\u01ef\0\u01fe\0\u020d\0\u021c\0\u022b"+ "\0\u023a\0\u0249\0\u0258\0\u0267\0\u0276\0\u0285\0\u0294\0\u02a3"+ - "\0\u02b2\0\u02c1\0\u02d0\0\u02df\0\170\0\377\0\u02ee\0\u02fd"+ - "\0\u030c"; + "\0\u02b2\0\u02c1\0\u02d0\0\u02df\0\u02ee\0\u02fd\0\u030c\0\u031b"+ + "\0\u032a\0\u0339\0\u0348\0\u0357\0\170\0\u011d\0\u0366\0\u0375"+ + "\0\u0384\0\u0393\0\u03a2\0\u01e0\0\u01ef\0\u03b1\0\u03c0"; private static int [] zzUnpackRowMap() { - int [] result = new int[57]; + int [] result = new int[71]; int offset = 0; offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); return result; @@ -127,49 +130,62 @@ private static final String ZZ_TRANS_PACKED_0 = "\10\2\1\3\1\4\1\5\1\6\1\7\1\10\1\2"+ - "\20\0\1\11\1\12\1\13\1\14\2\15\1\16\1\3"+ - "\1\4\1\5\1\6\5\0\1\17\1\0\1\20\2\21"+ - "\1\22\3\4\1\6\4\0\1\11\1\23\1\13\1\14"+ - "\2\21\1\22\1\5\1\4\1\5\1\6\5\0\1\24"+ - "\1\0\1\20\2\15\1\16\4\6\21\0\1\2\10\0"+ - "\1\25\1\0\1\25\14\0\1\26\1\27\1\30\1\31"+ - "\13\0\1\32\1\0\1\32\14\0\1\33\1\34\1\33"+ - "\1\34\13\0\1\35\2\36\1\37\13\0\1\16\2\40"+ - "\14\0\1\41\2\42\1\43\13\0\4\34\13\0\1\44"+ - "\2\45\1\46\13\0\1\47\2\50\1\51\13\0\1\52"+ - "\1\42\1\53\1\43\13\0\1\54\2\27\1\31\4\0"+ - "\1\11\6\0\1\25\1\0\1\25\6\0\1\55\1\0"+ - "\1\20\2\56\1\0\1\26\1\27\1\30\1\31\5\0"+ - "\1\57\1\0\1\20\2\60\1\61\3\27\1\31\5\0"+ - "\1\62\1\0\1\20\2\60\1\61\1\30\1\27\1\30"+ - "\1\31\5\0\1\63\1\0\1\20\2\56\1\0\4\31"+ - "\5\0\1\64\2\0\1\64\2\0\1\33\1\34\1\33"+ - "\1\34\5\0\1\64\2\0\1\64\2\0\4\34\5\0"+ - "\1\56\1\0\1\20\2\56\1\0\1\35\2\36\1\37"+ - "\5\0\1\60\1\0\1\20\2\60\1\61\3\36\1\37"+ - "\5\0\1\56\1\0\1\20\2\56\1\0\4\37\5\0"+ - "\1\61\2\0\3\61\3\40\6\0\1\24\1\0\1\20"+ - 
"\2\15\1\16\1\41\2\42\1\43\5\0\1\17\1\0"+ - "\1\20\2\21\1\22\3\42\1\43\5\0\1\24\1\0"+ - "\1\20\2\15\1\16\4\43\5\0\1\15\1\0\1\20"+ - "\2\15\1\16\1\44\2\45\1\46\5\0\1\21\1\0"+ - "\1\20\2\21\1\22\3\45\1\46\5\0\1\15\1\0"+ - "\1\20\2\15\1\16\4\46\5\0\1\16\2\0\3\16"+ - "\1\47\2\50\1\51\5\0\1\22\2\0\3\22\3\50"+ - "\1\51\5\0\1\16\2\0\3\16\4\51\5\0\1\65"+ - "\1\0\1\20\2\15\1\16\1\52\1\42\1\53\1\43"+ - "\5\0\1\66\1\0\1\20\2\21\1\22\1\53\1\42"+ - "\1\53\1\43\5\0\1\63\1\0\1\20\2\56\1\0"+ - "\1\54\2\27\1\31\13\0\1\67\1\31\1\67\1\31"+ - "\13\0\4\37\13\0\4\43\13\0\4\46\13\0\4\51"+ - "\13\0\1\70\1\43\1\70\1\43\13\0\4\31\13\0"+ - "\4\71\5\0\1\55\1\0\1\20\2\56\1\0\1\67"+ - "\1\31\1\67\1\31\5\0\1\65\1\0\1\20\2\15"+ - "\1\16\1\70\1\43\1\70\1\43\5\0\1\64\2\0"+ - "\1\64\2\0\4\71\3\0"; + "\20\0\1\11\1\12\1\13\1\14\2\15\1\16\1\17"+ + "\1\4\1\20\1\6\5\0\1\21\1\0\1\22\2\23"+ + "\1\24\3\4\1\6\4\0\1\11\1\25\1\13\1\14"+ + "\2\23\1\24\1\20\1\4\1\20\1\6\5\0\1\26"+ + "\1\0\1\22\2\15\1\16\4\6\21\0\1\2\10\0"+ + "\1\27\1\0\1\27\14\0\1\30\1\31\1\32\1\33"+ + "\13\0\1\34\1\0\1\34\14\0\1\35\1\36\1\35"+ + "\1\36\13\0\1\37\2\40\1\41\13\0\1\16\2\42"+ + "\5\0\1\11\1\43\1\13\1\14\2\15\1\16\1\17"+ + "\1\4\1\20\1\6\4\0\1\11\1\44\1\13\1\14"+ + "\2\23\1\24\1\20\1\4\1\20\1\6\13\0\1\45"+ + "\2\46\1\47\13\0\4\36\13\0\1\50\2\51\1\52"+ + "\13\0\1\53\2\54\1\55\13\0\1\56\1\46\1\57"+ + "\1\47\13\0\1\60\2\31\1\33\4\0\1\11\6\0"+ + "\1\27\1\0\1\27\6\0\1\61\1\0\1\22\2\62"+ + "\1\0\1\63\1\31\1\64\1\33\5\0\1\65\1\0"+ + "\1\22\2\66\1\67\3\31\1\33\5\0\1\70\1\0"+ + "\1\22\2\66\1\67\1\64\1\31\1\64\1\33\5\0"+ + "\1\71\1\0\1\22\2\62\1\0\4\33\5\0\1\72"+ + "\2\0\1\72\2\0\1\35\1\36\1\35\1\36\5\0"+ + "\1\72\2\0\1\72\2\0\4\36\5\0\1\62\1\0"+ + "\1\22\2\62\1\0\1\37\2\40\1\41\5\0\1\66"+ + "\1\0\1\22\2\66\1\67\3\40\1\41\5\0\1\62"+ + "\1\0\1\22\2\62\1\0\4\41\5\0\1\67\2\0"+ + "\3\67\3\42\14\0\1\63\1\31\1\64\1\33\13\0"+ + "\1\73\1\46\1\74\1\47\5\0\1\26\1\0\1\22"+ + "\2\15\1\16\1\45\2\46\1\47\5\0\1\21\1\0"+ + 
"\1\22\2\23\1\24\3\46\1\47\5\0\1\26\1\0"+ + "\1\22\2\15\1\16\4\47\5\0\1\15\1\0\1\22"+ + "\2\15\1\16\1\50\2\51\1\52\5\0\1\23\1\0"+ + "\1\22\2\23\1\24\3\51\1\52\5\0\1\15\1\0"+ + "\1\22\2\15\1\16\4\52\5\0\1\16\2\0\3\16"+ + "\1\53\2\54\1\55\5\0\1\24\2\0\3\24\3\54"+ + "\1\55\5\0\1\16\2\0\3\16\4\55\5\0\1\75"+ + "\1\0\1\22\2\15\1\16\1\73\1\46\1\74\1\47"+ + "\5\0\1\76\1\0\1\22\2\23\1\24\1\74\1\46"+ + "\1\74\1\47\5\0\1\71\1\0\1\22\2\62\1\0"+ + "\1\60\2\31\1\33\13\0\1\77\1\33\1\77\1\33"+ + "\13\0\4\41\5\0\1\100\1\0\1\22\2\62\1\0"+ + "\1\63\1\31\1\64\1\33\5\0\1\101\1\0\1\22"+ + "\2\66\1\67\1\64\1\31\1\64\1\33\13\0\4\47"+ + "\13\0\4\52\13\0\4\55\13\0\1\102\1\47\1\102"+ + "\1\47\13\0\4\33\13\0\4\103\5\0\1\104\1\0"+ + "\1\22\2\15\1\16\1\73\1\46\1\74\1\47\5\0"+ + "\1\105\1\0\1\22\2\23\1\24\1\74\1\46\1\74"+ + "\1\47\5\0\1\61\1\0\1\22\2\62\1\0\1\106"+ + "\1\33\1\106\1\33\13\0\1\106\1\33\1\106\1\33"+ + "\13\0\1\107\1\47\1\107\1\47\5\0\1\75\1\0"+ + "\1\22\2\15\1\16\1\107\1\47\1\107\1\47\5\0"+ + "\1\72\2\0\1\72\2\0\4\103\5\0\1\100\1\0"+ + "\1\22\2\62\1\0\1\106\1\33\1\106\1\33\5\0"+ + "\1\104\1\0\1\22\2\15\1\16\1\107\1\47\1\107"+ + "\1\47\3\0"; private static int [] zzUnpackTrans() { - int [] result = new int[795]; + int [] result = new int[975]; int offset = 0; offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); return result; @@ -207,11 +223,12 @@ private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); private static final String ZZ_ATTRIBUTE_PACKED_0 = - "\1\0\1\11\4\1\1\11\1\1\14\0\7\1\2\0"+ - "\1\1\1\0\16\1\4\0\1\1\2\0\5\1"; + "\1\0\1\11\4\1\1\11\1\1\6\0\2\1\6\0"+ + "\7\1\2\0\1\1\1\0\1\1\2\0\15\1\1\0"+ + "\2\1\3\0\1\1\2\0\15\1"; private static int [] zzUnpackAttribute() { - int [] result = new int[57]; + int [] result = new int[71]; int offset = 0; offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); return result; @@ -288,6 +305,12 @@ public static final int HOST = 5; public static final int NUM = 6; public static final int CJ = 7; +/** + * 
@deprecated this solves a bug where HOSTs that end with '.' are identified + * as ACRONYMs. It is deprecated and will be removed in the next + * release. + */ +public static final int ACRONYM_DEP = 8; public static final String [] TOKEN_TYPES = new String [] { "<ALPHANUM>", @@ -297,7 +320,8 @@ "<EMAIL>", "<HOST>", "<NUM>", - "<CJ>" + "<CJ>", + "<ACRONYM_DEP>" }; public final int yychar() @@ -605,39 +629,43 @@ case 5: { return HOST; } - case 10: break; + case 11: break; + case 9: + { return ACRONYM_DEP; + } + case 12: break; case 8: { return ACRONYM; } - case 11: break; + case 13: break; case 1: { /* ignore */ } - case 12: break; + case 14: break; case 7: { return NUM; } - case 13: break; + case 15: break; case 3: { return CJ; } - case 14: break; + case 16: break; case 2: { return ALPHANUM; } - case 15: break; + case 17: break; case 6: { return COMPANY; } - case 16: break; + case 18: break; case 4: { return APOSTROPHE; } - case 17: break; - case 9: + case 19: break; + case 10: { return EMAIL; } - case 18: break; + case 20: break; default: if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { zzAtEOF = true; Index: D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex =================================================================== --- D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex (revision 598379) +++ D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex (working copy) @@ -38,6 +38,12 @@ public static final int HOST = 5; public static final int NUM = 6; public static final int CJ = 7; +/** + * @deprecated this solves a bug where HOSTs that end with '.' are identified + * as ACRONYMs. It is deprecated and will be removed in the next + * release. + */ +public static final int ACRONYM_DEP = 8; public static final String [] TOKEN_TYPES = new String [] { "<ALPHANUM>", @@ -47,7 +53,8 @@ "<EMAIL>", "<HOST>", "<NUM>", - "<CJ>" + "<CJ>", + "<ACRONYM_DEP>" }; public final int yychar() @@ -72,7 +79,9 @@ // acronyms: U.S.A., I.B.M., etc. 
// use a post-filter to remove dots -ACRONYM = {ALPHA} "." ({ALPHA} ".")+ +ACRONYM = {LETTER} "." ({LETTER} ".")+ + +ACRONYM_DEP = {ALPHA} "." ({ALPHA} ".")+ // company names like AT&T and Excite@Home. COMPANY = {ALPHA} ("&"|"@") {ALPHA} @@ -125,6 +134,7 @@ {HOST} { return HOST; } {NUM} { return NUM; } {CJ} { return CJ; } +{ACRONYM_DEP} { return ACRONYM_DEP; } /** Ignore the rest */ . | {WHITESPACE} { /* ignore */ }