Patch for StandardTokenizer.jj (CVS revision 1.3).

What the hunks below do:
  1. Turn on the previously commented-out UNICODE_INPUT option.
  2. Add a public token SIGRAM defined as one or more CJK characters.
  3. End the #LETTER character class after "\u0100"-"\u1fff" and move the
     ranges that followed it ("\u3040"-"\u318f" onward) into a new private
     character class #CJK.
  4. Add SIGRAM to the list of "token = ..." alternatives in the parser
     production.

NOTE(review): this patch was recovered from a copy in which all line breaks
were collapsed and every <NAME> token reference had been stripped. The line
structure was rebuilt from the hunk counts. The token references on context
lines (<LETTER>, <DIGIT>, <EMAIL>, <HOST>, <NUM>, <EOF>), the blank context
line in the second hunk, and all leading whitespace are reconstructions --
confirm against revision 1.3 before applying. If context whitespace does not
match, apply with "patch -l".

Index: StandardTokenizer.jj
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj,v
retrieving revision 1.3
diff -u -r1.3 StandardTokenizer.jj
--- StandardTokenizer.jj	5 Jun 2002 04:54:47 -0000	1.3
+++ StandardTokenizer.jj	30 Sep 2003 16:17:29 -0000
@@ -56,7 +56,7 @@
   STATIC = false;
 //IGNORE_CASE = true;
 //BUILD_PARSER = false;
-//UNICODE_INPUT = true;
+  UNICODE_INPUT = true;
   USER_CHAR_STREAM = true;
   OPTIMIZE_TOKEN_MANAGER = true;
 //DEBUG_TOKEN_MANAGER = true;
@@ -125,6 +125,7 @@
     (<LETTER>|<DIGIT>)*
   >
 
+| < SIGRAM: (<CJK>)+ >
 | < #ALPHA: (<LETTER>)+>
 | < #LETTER:                                      // unicode letters
       [
@@ -133,7 +134,11 @@
        "\u00c0"-"\u00d6",
        "\u00d8"-"\u00f6",
        "\u00f8"-"\u00ff",
-       "\u0100"-"\u1fff",
+       "\u0100"-"\u1fff"
+      ]
+  >
+| < #CJK:                                         // non-alphabets
+      [
        "\u3040"-"\u318f",
        "\u3300"-"\u337f",
        "\u3400"-"\u3d2d",
@@ -182,6 +187,7 @@
     token = <EMAIL> |
     token = <HOST> |
     token = <NUM> |
+    token = <SIGRAM> |
     token = <EOF>
    )
     {