Index: D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
===================================================================
--- D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java (revision 598379)
+++ D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java (working copy)
@@ -33,6 +33,12 @@
public class StandardAnalyzer extends Analyzer {
private Set stopSet;
+ /**
+ * Specifies whether this analyzer's tokenizers type deprecated-acronym (ACRONYM_DEP) tokens as HOST (true) or as ACRONYM (false).
+ * @deprecated this flag should be removed in the next release (3.0).
+ */
+ public boolean replaceDepAcronym = true;
+
/** An array containing some common English words that are usually not
useful for searching. */
public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
@@ -70,6 +76,7 @@
StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(reader);
+ ((StandardTokenizer) result).replaceDepAcronym = replaceDepAcronym;
result = new StandardFilter(result);
result = new LowerCaseFilter(result);
result = new StopFilter(result, stopSet);
@@ -79,7 +86,8 @@
private class SavedStreams {
StandardTokenizer tokenStream;
TokenStream filteredTokenStream;
- };
+ }
+
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
@@ -89,9 +97,12 @@
streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet);
- } else
+ } else {
streams.tokenStream.reset(reader);
+ }
+ streams.tokenStream.replaceDepAcronym = replaceDepAcronym;
+
return streams.filteredTokenStream;
}
}
Index: D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
===================================================================
--- D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (revision 598379)
+++ D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (working copy)
@@ -43,6 +43,13 @@
public class StandardTokenizer extends Tokenizer {
/** A private instance of the JFlex-constructed scanner */
private final StandardTokenizerImpl scanner;
+
+ /**
+ * Specifies whether tokens scanned as ACRONYM_DEP are given the HOST type (true) or the ACRONYM type (false).
+ * @deprecated this flag should be removed in the next release (3.0).
+ */
+ public boolean replaceDepAcronym = true;
+
void setInput(Reader reader) {
this.input = reader;
}
@@ -72,7 +79,18 @@
final int start = scanner.yychar();
result.setStartOffset(start);
result.setEndOffset(start+result.termLength());
- result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
+ // This 'if' should be removed in the next release. For now, ACRONYM_DEP
+ // tokens are typed HOST if replaceDepAcronym is set, otherwise ACRONYM.
+ // When it is removed, only the outer 'else' part should remain.
+ if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
+ if (replaceDepAcronym) {
+ result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
+ } else {
+ result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
+ }
+ } else {
+ result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
+ }
return result;
}
Index: D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
===================================================================
--- D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java (revision 598379)
+++ D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java (working copy)
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.4.1 on 8/9/07 10:15 AM */
+/* The following code was generated by JFlex 1.4.1 on 11/29/07 3:16 PM */
package org.apache.lucene.analysis.standard;
@@ -25,8 +25,8 @@
/**
* This class is a scanner generated by
* JFlex 1.4.1
- * on 8/9/07 10:15 AM from the specification file
- * /tango/mike/src/lucene.tokenfix/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
+ * on 11/29/07 3:16 PM from the specification file
+ * D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
*/
class StandardTokenizerImpl {
@@ -63,12 +63,14 @@
private static final int [] ZZ_ACTION = zzUnpackAction();
private static final String ZZ_ACTION_PACKED_0 =
- "\1\0\1\1\4\2\1\3\1\1\14\0\1\4\4\5"+
- "\2\6\2\0\1\7\1\0\1\7\3\5\6\7\3\5"+
- "\1\10\4\0\1\10\2\0\2\10\2\5\1\11";
+ "\1\0\1\1\4\2\1\3\1\1\6\0\2\2\6\0"+
+ "\1\4\4\5\2\6\2\0\1\7\1\0\1\7\2\0"+
+ "\3\5\6\7\3\5\1\10\1\0\2\5\3\0\1\10"+
+ "\2\0\2\5\2\10\1\5\2\11\1\5\1\12\2\11"+
+ "\2\5";
private static int [] zzUnpackAction() {
- int [] result = new int[57];
+ int [] result = new int[71];
int offset = 0;
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
return result;
@@ -96,14 +98,15 @@
"\0\0\0\17\0\36\0\55\0\74\0\113\0\17\0\132"+
"\0\151\0\170\0\207\0\226\0\245\0\264\0\303\0\322"+
"\0\341\0\360\0\377\0\u010e\0\u011d\0\u012c\0\u013b\0\u014a"+
- "\0\u0159\0\207\0\u0168\0\u0177\0\u0186\0\u0195\0\u01a4\0\u01b3"+
+ "\0\u0159\0\u0168\0\u0177\0\207\0\u0186\0\u0195\0\u01a4\0\u01b3"+
"\0\u01c2\0\u01d1\0\u01e0\0\u01ef\0\u01fe\0\u020d\0\u021c\0\u022b"+
"\0\u023a\0\u0249\0\u0258\0\u0267\0\u0276\0\u0285\0\u0294\0\u02a3"+
- "\0\u02b2\0\u02c1\0\u02d0\0\u02df\0\170\0\377\0\u02ee\0\u02fd"+
- "\0\u030c";
+ "\0\u02b2\0\u02c1\0\u02d0\0\u02df\0\u02ee\0\u02fd\0\u030c\0\u031b"+
+ "\0\u032a\0\u0339\0\u0348\0\u0357\0\170\0\u011d\0\u0366\0\u0375"+
+ "\0\u0384\0\u0393\0\u03a2\0\u01e0\0\u01ef\0\u03b1\0\u03c0";
private static int [] zzUnpackRowMap() {
- int [] result = new int[57];
+ int [] result = new int[71];
int offset = 0;
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
return result;
@@ -127,49 +130,62 @@
private static final String ZZ_TRANS_PACKED_0 =
"\10\2\1\3\1\4\1\5\1\6\1\7\1\10\1\2"+
- "\20\0\1\11\1\12\1\13\1\14\2\15\1\16\1\3"+
- "\1\4\1\5\1\6\5\0\1\17\1\0\1\20\2\21"+
- "\1\22\3\4\1\6\4\0\1\11\1\23\1\13\1\14"+
- "\2\21\1\22\1\5\1\4\1\5\1\6\5\0\1\24"+
- "\1\0\1\20\2\15\1\16\4\6\21\0\1\2\10\0"+
- "\1\25\1\0\1\25\14\0\1\26\1\27\1\30\1\31"+
- "\13\0\1\32\1\0\1\32\14\0\1\33\1\34\1\33"+
- "\1\34\13\0\1\35\2\36\1\37\13\0\1\16\2\40"+
- "\14\0\1\41\2\42\1\43\13\0\4\34\13\0\1\44"+
- "\2\45\1\46\13\0\1\47\2\50\1\51\13\0\1\52"+
- "\1\42\1\53\1\43\13\0\1\54\2\27\1\31\4\0"+
- "\1\11\6\0\1\25\1\0\1\25\6\0\1\55\1\0"+
- "\1\20\2\56\1\0\1\26\1\27\1\30\1\31\5\0"+
- "\1\57\1\0\1\20\2\60\1\61\3\27\1\31\5\0"+
- "\1\62\1\0\1\20\2\60\1\61\1\30\1\27\1\30"+
- "\1\31\5\0\1\63\1\0\1\20\2\56\1\0\4\31"+
- "\5\0\1\64\2\0\1\64\2\0\1\33\1\34\1\33"+
- "\1\34\5\0\1\64\2\0\1\64\2\0\4\34\5\0"+
- "\1\56\1\0\1\20\2\56\1\0\1\35\2\36\1\37"+
- "\5\0\1\60\1\0\1\20\2\60\1\61\3\36\1\37"+
- "\5\0\1\56\1\0\1\20\2\56\1\0\4\37\5\0"+
- "\1\61\2\0\3\61\3\40\6\0\1\24\1\0\1\20"+
- "\2\15\1\16\1\41\2\42\1\43\5\0\1\17\1\0"+
- "\1\20\2\21\1\22\3\42\1\43\5\0\1\24\1\0"+
- "\1\20\2\15\1\16\4\43\5\0\1\15\1\0\1\20"+
- "\2\15\1\16\1\44\2\45\1\46\5\0\1\21\1\0"+
- "\1\20\2\21\1\22\3\45\1\46\5\0\1\15\1\0"+
- "\1\20\2\15\1\16\4\46\5\0\1\16\2\0\3\16"+
- "\1\47\2\50\1\51\5\0\1\22\2\0\3\22\3\50"+
- "\1\51\5\0\1\16\2\0\3\16\4\51\5\0\1\65"+
- "\1\0\1\20\2\15\1\16\1\52\1\42\1\53\1\43"+
- "\5\0\1\66\1\0\1\20\2\21\1\22\1\53\1\42"+
- "\1\53\1\43\5\0\1\63\1\0\1\20\2\56\1\0"+
- "\1\54\2\27\1\31\13\0\1\67\1\31\1\67\1\31"+
- "\13\0\4\37\13\0\4\43\13\0\4\46\13\0\4\51"+
- "\13\0\1\70\1\43\1\70\1\43\13\0\4\31\13\0"+
- "\4\71\5\0\1\55\1\0\1\20\2\56\1\0\1\67"+
- "\1\31\1\67\1\31\5\0\1\65\1\0\1\20\2\15"+
- "\1\16\1\70\1\43\1\70\1\43\5\0\1\64\2\0"+
- "\1\64\2\0\4\71\3\0";
+ "\20\0\1\11\1\12\1\13\1\14\2\15\1\16\1\17"+
+ "\1\4\1\20\1\6\5\0\1\21\1\0\1\22\2\23"+
+ "\1\24\3\4\1\6\4\0\1\11\1\25\1\13\1\14"+
+ "\2\23\1\24\1\20\1\4\1\20\1\6\5\0\1\26"+
+ "\1\0\1\22\2\15\1\16\4\6\21\0\1\2\10\0"+
+ "\1\27\1\0\1\27\14\0\1\30\1\31\1\32\1\33"+
+ "\13\0\1\34\1\0\1\34\14\0\1\35\1\36\1\35"+
+ "\1\36\13\0\1\37\2\40\1\41\13\0\1\16\2\42"+
+ "\5\0\1\11\1\43\1\13\1\14\2\15\1\16\1\17"+
+ "\1\4\1\20\1\6\4\0\1\11\1\44\1\13\1\14"+
+ "\2\23\1\24\1\20\1\4\1\20\1\6\13\0\1\45"+
+ "\2\46\1\47\13\0\4\36\13\0\1\50\2\51\1\52"+
+ "\13\0\1\53\2\54\1\55\13\0\1\56\1\46\1\57"+
+ "\1\47\13\0\1\60\2\31\1\33\4\0\1\11\6\0"+
+ "\1\27\1\0\1\27\6\0\1\61\1\0\1\22\2\62"+
+ "\1\0\1\63\1\31\1\64\1\33\5\0\1\65\1\0"+
+ "\1\22\2\66\1\67\3\31\1\33\5\0\1\70\1\0"+
+ "\1\22\2\66\1\67\1\64\1\31\1\64\1\33\5\0"+
+ "\1\71\1\0\1\22\2\62\1\0\4\33\5\0\1\72"+
+ "\2\0\1\72\2\0\1\35\1\36\1\35\1\36\5\0"+
+ "\1\72\2\0\1\72\2\0\4\36\5\0\1\62\1\0"+
+ "\1\22\2\62\1\0\1\37\2\40\1\41\5\0\1\66"+
+ "\1\0\1\22\2\66\1\67\3\40\1\41\5\0\1\62"+
+ "\1\0\1\22\2\62\1\0\4\41\5\0\1\67\2\0"+
+ "\3\67\3\42\14\0\1\63\1\31\1\64\1\33\13\0"+
+ "\1\73\1\46\1\74\1\47\5\0\1\26\1\0\1\22"+
+ "\2\15\1\16\1\45\2\46\1\47\5\0\1\21\1\0"+
+ "\1\22\2\23\1\24\3\46\1\47\5\0\1\26\1\0"+
+ "\1\22\2\15\1\16\4\47\5\0\1\15\1\0\1\22"+
+ "\2\15\1\16\1\50\2\51\1\52\5\0\1\23\1\0"+
+ "\1\22\2\23\1\24\3\51\1\52\5\0\1\15\1\0"+
+ "\1\22\2\15\1\16\4\52\5\0\1\16\2\0\3\16"+
+ "\1\53\2\54\1\55\5\0\1\24\2\0\3\24\3\54"+
+ "\1\55\5\0\1\16\2\0\3\16\4\55\5\0\1\75"+
+ "\1\0\1\22\2\15\1\16\1\73\1\46\1\74\1\47"+
+ "\5\0\1\76\1\0\1\22\2\23\1\24\1\74\1\46"+
+ "\1\74\1\47\5\0\1\71\1\0\1\22\2\62\1\0"+
+ "\1\60\2\31\1\33\13\0\1\77\1\33\1\77\1\33"+
+ "\13\0\4\41\5\0\1\100\1\0\1\22\2\62\1\0"+
+ "\1\63\1\31\1\64\1\33\5\0\1\101\1\0\1\22"+
+ "\2\66\1\67\1\64\1\31\1\64\1\33\13\0\4\47"+
+ "\13\0\4\52\13\0\4\55\13\0\1\102\1\47\1\102"+
+ "\1\47\13\0\4\33\13\0\4\103\5\0\1\104\1\0"+
+ "\1\22\2\15\1\16\1\73\1\46\1\74\1\47\5\0"+
+ "\1\105\1\0\1\22\2\23\1\24\1\74\1\46\1\74"+
+ "\1\47\5\0\1\61\1\0\1\22\2\62\1\0\1\106"+
+ "\1\33\1\106\1\33\13\0\1\106\1\33\1\106\1\33"+
+ "\13\0\1\107\1\47\1\107\1\47\5\0\1\75\1\0"+
+ "\1\22\2\15\1\16\1\107\1\47\1\107\1\47\5\0"+
+ "\1\72\2\0\1\72\2\0\4\103\5\0\1\100\1\0"+
+ "\1\22\2\62\1\0\1\106\1\33\1\106\1\33\5\0"+
+ "\1\104\1\0\1\22\2\15\1\16\1\107\1\47\1\107"+
+ "\1\47\3\0";
private static int [] zzUnpackTrans() {
- int [] result = new int[795];
+ int [] result = new int[975];
int offset = 0;
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
return result;
@@ -207,11 +223,12 @@
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
private static final String ZZ_ATTRIBUTE_PACKED_0 =
- "\1\0\1\11\4\1\1\11\1\1\14\0\7\1\2\0"+
- "\1\1\1\0\16\1\4\0\1\1\2\0\5\1";
+ "\1\0\1\11\4\1\1\11\1\1\6\0\2\1\6\0"+
+ "\7\1\2\0\1\1\1\0\1\1\2\0\15\1\1\0"+
+ "\2\1\3\0\1\1\2\0\15\1";
private static int [] zzUnpackAttribute() {
- int [] result = new int[57];
+ int [] result = new int[71];
int offset = 0;
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
return result;
@@ -288,6 +305,12 @@
public static final int HOST = 5;
public static final int NUM = 6;
public static final int CJ = 7;
+/**
+ * @deprecated this solves a bug where HOSTs that end with '.' are identified
+ * as ACRONYMs. It is deprecated and will be removed in the next
+ * release.
+ */
+public static final int ACRONYM_DEP = 8;
public static final String [] TOKEN_TYPES = new String [] {
"<ALPHANUM>",
@@ -297,7 +320,8 @@
"<EMAIL>",
"<HOST>",
"<NUM>",
- "<CJ>"
+ "<CJ>",
+ "<ACRONYM_DEP>"
};
public final int yychar()
@@ -605,39 +629,43 @@
case 5:
{ return HOST;
}
- case 10: break;
+ case 11: break;
+ case 9:
+ { return ACRONYM_DEP;
+ }
+ case 12: break;
case 8:
{ return ACRONYM;
}
- case 11: break;
+ case 13: break;
case 1:
{ /* ignore */
}
- case 12: break;
+ case 14: break;
case 7:
{ return NUM;
}
- case 13: break;
+ case 15: break;
case 3:
{ return CJ;
}
- case 14: break;
+ case 16: break;
case 2:
{ return ALPHANUM;
}
- case 15: break;
+ case 17: break;
case 6:
{ return COMPANY;
}
- case 16: break;
+ case 18: break;
case 4:
{ return APOSTROPHE;
}
- case 17: break;
- case 9:
+ case 19: break;
+ case 10:
{ return EMAIL;
}
- case 18: break;
+ case 20: break;
default:
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
Index: D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
===================================================================
--- D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex (revision 598379)
+++ D:/dev/lucene/lucene-trunk/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex (working copy)
@@ -38,6 +38,12 @@
public static final int HOST = 5;
public static final int NUM = 6;
public static final int CJ = 7;
+/**
+ * @deprecated this solves a bug where HOSTs that end with '.' are identified
+ * as ACRONYMs. It is deprecated and will be removed in the next
+ * release.
+ */
+public static final int ACRONYM_DEP = 8;
public static final String [] TOKEN_TYPES = new String [] {
"<ALPHANUM>",
@@ -47,7 +53,8 @@
"<EMAIL>",
"<HOST>",
"<NUM>",
- "<CJ>"
+ "<CJ>",
+ "<ACRONYM_DEP>"
};
public final int yychar()
@@ -72,7 +79,9 @@
// acronyms: U.S.A., I.B.M., etc.
// use a post-filter to remove dots
-ACRONYM = {ALPHA} "." ({ALPHA} ".")+
+ACRONYM = {LETTER} "." ({LETTER} ".")+
+
+ACRONYM_DEP = {ALPHA} "." ({ALPHA} ".")+
// company names like AT&T and Excite@Home.
COMPANY = {ALPHA} ("&"|"@") {ALPHA}
@@ -125,6 +134,7 @@
{HOST} { return HOST; }
{NUM} { return NUM; }
{CJ} { return CJ; }
+{ACRONYM_DEP} { return ACRONYM_DEP; }
/** Ignore the rest */
. | {WHITESPACE} { /* ignore */ }