# Patch: StandardTokenizer.jj.old -> StandardTokenizer.jj (unified diff, 4 hunks).
#
# Summary of what the hunks below show:
#   @@ -1,4 +1,4 @@      The opening "/**" of the license comment becomes "/**f".
#                        NOTE(review): the added "f" looks like a stray keystroke
#                        rather than an intended change -- confirm and drop this hunk.
#   @@ -59,7 +59,7 @@    The "basic word" token pattern gains one more "|" alternative.
#                        NOTE(review): the names of the alternatives are not visible
#                        in this copy (text inside angle brackets appears to have been
#                        stripped); presumably the new alternative is the KOREAN token
#                        added further down -- verify against the original patch.
#   @@ -106,16 +106,20 @@ The CJK token ("non-alphabets") is renamed to CJ
#                        ("Chinese, Japanese"); the range "\uac00"-"\ud7af" is removed
#                        from it and moved into a new KOREAN token.
#   @@ -157,7 +161,7 @@  One "token = ... |" alternative is replaced.
#                        NOTE(review): identifiers are not visible here; presumably
#                        this is the CJK -> CJ rename at the point of use -- confirm.
#
# NOTE(review): in this copy the entire patch sits on a single line, so line
# breaks must be restored before a patch tool can apply it.
--- StandardTokenizer.jj.old 2005-11-08 15:44:44.000000000 +0900 +++ StandardTokenizer.jj 2005-11-08 15:10:24.000000000 +0900 @@ -1,4 +1,4 @@ -/** +/**f * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -59,7 +59,7 @@ TOKEN : { // token patterns // basic word: a sequence of digits & letters - |)+ > + ||)+ > // internal apostrophes: O'Reilly, you're, O'Reilly's // use a post-filter to remove possesives @@ -106,16 +106,20 @@ "\u0100"-"\u1fff" ] > -| < CJK: // non-alphabets +| < CJ: // Chinese, Japanese [ "\u3040"-"\u318f", "\u3300"-"\u337f", "\u3400"-"\u3d2d", "\u4e00"-"\u9fff", - "\uac00"-"\ud7af", "\uf900"-"\ufaff" ] > +| < KOREAN: // Korean + [ + "\uac00"-"\ud7af" + ] + > | < #DIGIT: // unicode digits [ "\u0030"-"\u0039", @@ -157,7 +161,7 @@ token = | token = | token = | - token = | + token = | token = ) {