Index: lucene/src/test/org/apache/lucene/analysis/TestUAX29Tokenizer.java
===================================================================
--- lucene/src/test/org/apache/lucene/analysis/TestUAX29Tokenizer.java	(revision 0)
+++ lucene/src/test/org/apache/lucene/analysis/TestUAX29Tokenizer.java	(revision 0)
@@ -0,0 +1,186 @@
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.standard.UAX29Tokenizer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tests for {@code UAX29Tokenizer}: UAX#29 word-break behavior across many
+ * scripts, buffer-refill handling, and offset/type/position bookkeeping.
+ */
+public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
+  
+  /** First token starts beyond one lookahead-buffer's worth of whitespace,
+      forcing at least one internal buffer refill before a match. */
+  public void testHugeDoc() throws IOException {
+    StringBuilder sb = new StringBuilder();
+    char whitespace[] = new char[4094];
+    Arrays.fill(whitespace, ' ');
+    sb.append(whitespace);
+    sb.append("testing 1234");
+    String input = sb.toString();
+    UAX29Tokenizer tokenizer = new UAX29Tokenizer(new StringReader(input));
+    assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
+  }
+
+  /** Analyzer wrapping the tokenizer under test, reused by all script tests. */
+  private Analyzer a = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents
+      (String fieldName, Reader reader) {
+
+      Tokenizer tokenizer = new UAX29Tokenizer(reader);
+      return new TokenStreamComponents(tokenizer);
+    }
+  };
+
+  public void testArmenian() throws Exception {
+    assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
+        new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից", 
+        "ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "Վիքիպեդիայի", "կայքը" } );
+  }
+  
+  public void testAmharic() throws Exception {
+    assertAnalyzesTo(a, "ዊኪፔድያ የባለ ብዙ ቋንቋ የተሟላ ትክክለኛና ነጻ መዝገበ ዕውቀት (ኢንሳይክሎፒዲያ) ነው። ማንኛውም",
+        new String[] { "ዊኪፔድያ", "የባለ", "ብዙ", "ቋንቋ", "የተሟላ", "ትክክለኛና", "ነጻ", "መዝገበ", "ዕውቀት", "ኢንሳይክሎፒዲያ", "ነው", "ማንኛውም" } );
+  }
+  
+  public void testArabic() throws Exception {
+    assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.",
+        new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
+        "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" } ); 
+  }
+  
+  public void testAramaic() throws Exception {
+    assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀",
+        new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "Wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
+        "ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"});
+  }
+  
+  public void testBengali() throws Exception {
+    assertAnalyzesTo(a, "এই বিশ্বকোষ পরিচালনা করে উইকিমিডিয়া ফাউন্ডেশন (একটি অলাভজনক সংস্থা)। উইকিপিডিয়ার শুরু ১৫ জানুয়ারি, ২০০১ সালে। এখন পর্যন্ত ২০০টিরও বেশী ভাষায় উইকিপিডিয়া রয়েছে।",
+        new String[] { "এই", "বিশ্বকোষ", "পরিচালনা", "করে", "উইকিমিডিয়া", "ফাউন্ডেশন", "একটি", "অলাভজনক", "সংস্থা", "উইকিপিডিয়ার",
+        "শুরু", "১৫", "জানুয়ারি", "২০০১", "সালে", "এখন", "পর্যন্ত", "২০০টিরও", "বেশী", "ভাষায়", "উইকিপিডিয়া", "রয়েছে" });
+  }
+  
+  public void testFarsi() throws Exception {
+    assertAnalyzesTo(a, "ویکی پدیای انگلیسی در تاریخ ۲۵ دی ۱۳۷۹ به صورت مکملی برای دانشنامهٔ تخصصی نوپدیا نوشته شد.",
+        new String[] { "ویکی", "پدیای", "انگلیسی", "در", "تاریخ", "۲۵", "دی", "۱۳۷۹", "به", "صورت", "مکملی",
+        "برای", "دانشنامهٔ", "تخصصی", "نوپدیا", "نوشته", "شد" });
+  }
+  
+  public void testGreek() throws Exception {
+    assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.",
+        new String[] { "Γράφεται", "σε", "συνεργασία", "από", "εθελοντές", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
+        "σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
+  }
+
+  public void testTibetan() throws Exception {
+    assertAnalyzesTo(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །",
+                     new String[] { "སྣོན", "མཛོད", "དང", "ལས", "འདིས", "བོད", "ཡིག", 
+                                    "མི", "ཉམས", "གོང", "འཕེལ", "དུ", "གཏོང", "བར", 
+                                    "ཧ", "ཅང", "དགེ", "མཚན", "མཆིས", "སོ" });
+  }
+  
+  /*
+   * For chinese, tokenize as char (these can later form bigrams or whatever)
+   * TODO: why do full-width numerics have no word-break prop?
+   */
+  public void testChinese() throws Exception {
+    assertAnalyzesTo(a, "我是中国人。 １２３４ Ｔｅｓｔｓ ",
+        new String[] { "我", "是", "中", "国", "人", "Ｔｅｓｔｓ"});
+  }
+  
+  /** Empty input and punctuation/whitespace-only input yield no tokens. */
+  public void testEmpty() throws Exception {
+    assertAnalyzesTo(a, "", new String[] {});
+    assertAnalyzesTo(a, ".", new String[] {});
+    assertAnalyzesTo(a, " ", new String[] {});
+  }
+  
+  /* test various jira issues this analyzer is related to */
+  
+  public void testLUCENE1545() throws Exception {
+    /*
+     * Standard analyzer does not correctly tokenize combining character U+0364 COMBINING LATIN SMALL LETTRE E.
+     * The word "moͤchte" is incorrectly tokenized into "mo" "chte", the combining character is lost.
+     * Expected result is only on token "moͤchte".
+     */
+    assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" }); 
+  }
+  
+  /* Tests from StandardAnalyzer, just to show behavior is similar */
+  public void testAlphanumericSA() throws Exception {
+    // alphanumeric tokens
+    assertAnalyzesTo(a, "B2B", new String[]{"B2B"});
+    assertAnalyzesTo(a, "2B", new String[]{"2B"});
+  }
+
+  public void testDelimitersSA() throws Exception {
+    // other delimiters: "-", "/", ","
+    assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
+    assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
+    assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
+  }
+
+  public void testApostrophesSA() throws Exception {
+    // internal apostrophes: O'Reilly, you're, O'Reilly's
+    assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"});
+    assertAnalyzesTo(a, "you're", new String[]{"you're"});
+    assertAnalyzesTo(a, "she's", new String[]{"she's"});
+    assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"});
+    assertAnalyzesTo(a, "don't", new String[]{"don't"});
+    assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"});
+  }
+
+  public void testNumericSA() throws Exception {
+    // floating point, serial, model numbers, ip addresses, etc.
+    // every other segment must have at least one digit
+    assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
+    assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
+    assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
+  }
+
+  public void testTextWithNumbersSA() throws Exception {
+    // numbers
+    assertAnalyzesTo(a, "David has 5000 bones", new String[]{"David", "has", "5000", "bones"});
+  }
+
+  public void testVariousTextSA() throws Exception {
+    // various
+    assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"C", "embedded", "developers", "wanted"});
+    assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "FOO", "BAR"});
+    assertAnalyzesTo(a, "foo      bar .  FOO <> BAR", new String[]{"foo", "bar", "FOO", "BAR"});
+    assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"QUOTED", "word"});
+  }
+
+  public void testKoreanSA() throws Exception {
+    // Korean words
+    assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
+  }
+  
+  /** Start/end offsets must map back into the original text. */
+  public void testOffsets() throws Exception {
+    assertAnalyzesTo(a, "David has 5000 bones", 
+        new String[] {"David", "has", "5000", "bones"},
+        new int[] {0, 6, 10, 15},
+        new int[] {5, 9, 14, 20});
+  }
+  
+  /** Token types: alphanumeric runs are &lt;WORD&gt;, numeric runs are &lt;NUM&gt;. */
+  public void testTypes() throws Exception {
+    assertAnalyzesTo(a, "David has 5000 bones", 
+        new String[] {"David", "has", "5000", "bones"},
+        new String[] { "<WORD>", "<WORD>", "<NUM>", "<WORD>" });
+  }
+}
Index: lucene/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.java
===================================================================
--- lucene/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.java	(revision 0)
+++ lucene/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.java	(revision 0)
@@ -0,0 +1,787 @@
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 5/9/10 12:24 AM */
+
+package org.apache.lucene.analysis.standard;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.CharBuffer;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+
+/**
+ * This class implements the Unicode Text Segmentation algorithm, as specified
+ * in Unicode Standard Annex #29
+ * (<a href="http://unicode.org/reports/tr29/">http://unicode.org/reports/tr29/</a>).
+ */
+
+public class UAX29Tokenizer extends Tokenizer {
+
+  // NOTE(review): JFlex-generated scanner constants -- regenerate from the
+  // grammar rather than hand-editing.
+
+  /** This character denotes the end of file */
+  public static final int YYEOF = -1;
+
+  /** initial size of the lookahead buffer */
+  private static final int ZZ_BUFFERSIZE = 16384;
+
+  /** lexical states */
+  public static final int YYINITIAL = 0;
+
+  /**
+   * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+   * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+   *                  at the beginning of a line
+   * l is of the form l = 2*k, k a non negative integer
+   */
+  private static final int ZZ_LEXSTATE[] = { 
+     0, 0
+  };
+
+  /** 
+   * Translates characters to character classes.
+   * Run-length encoded as (count, value) char pairs; expanded by
+   * zzUnpackCMap() into a 64K-entry table (BMP code units only).
+   * Generated data -- do not hand-edit.
+   */
+  private static final String ZZ_CMAP_PACKED = 
+    "\47\0\1\7\4\0\1\6\1\0\1\7\1\0\12\3\1\5\1\6"+
+    "\5\0\32\1\4\0\1\10\1\0\32\1\57\0\1\1\2\0\1\2"+
+    "\7\0\1\1\1\0\1\5\2\0\1\1\5\0\27\1\1\0\37\1"+
+    "\1\0\u01ca\1\4\0\14\1\16\0\5\1\7\0\1\1\1\0\1\1"+
+    "\21\0\160\2\5\1\1\0\2\1\2\0\4\1\1\6\7\0\1\1"+
+    "\1\5\3\1\1\0\1\1\1\0\24\1\1\0\123\1\1\0\213\1"+
+    "\1\0\7\2\234\1\13\0\46\1\2\0\1\1\7\0\47\1\1\0"+
+    "\1\6\7\0\55\2\1\0\1\2\1\0\2\2\1\0\2\2\1\0"+
+    "\1\2\10\0\33\1\5\0\4\1\1\5\13\0\4\2\10\0\2\6"+
+    "\2\0\13\2\6\0\52\1\24\2\1\0\12\3\1\0\1\3\1\6"+
+    "\1\0\2\1\1\2\143\1\1\0\1\1\17\2\2\1\2\2\1\0"+
+    "\4\2\2\1\12\3\3\1\2\0\1\1\17\0\1\2\1\1\1\2"+
+    "\36\1\33\2\2\0\131\1\13\2\1\1\16\0\12\3\41\1\11\2"+
+    "\2\1\2\0\1\6\1\0\1\1\5\0\26\1\4\2\1\1\11\2"+
+    "\1\1\3\2\1\1\5\2\322\0\4\2\66\1\2\0\1\2\1\1"+
+    "\21\2\1\0\1\1\5\2\2\0\12\1\2\2\2\0\12\3\1\0"+
+    "\2\1\6\0\7\1\1\0\3\2\1\0\10\1\2\0\2\1\2\0"+
+    "\26\1\1\0\7\1\1\0\1\1\3\0\4\1\2\0\1\2\1\1"+
+    "\7\2\2\0\2\2\2\0\3\2\1\1\10\0\1\2\4\0\2\1"+
+    "\1\0\3\1\2\2\2\0\12\3\2\1\17\0\3\2\1\0\6\1"+
+    "\4\0\2\1\2\0\26\1\1\0\7\1\1\0\2\1\1\0\2\1"+
+    "\1\0\2\1\2\0\1\2\1\0\5\2\4\0\2\2\2\0\3\2"+
+    "\3\0\1\2\7\0\4\1\1\0\1\1\7\0\12\3\2\2\3\1"+
+    "\1\2\13\0\3\2\1\0\11\1\1\0\3\1\1\0\26\1\1\0"+
+    "\7\1\1\0\2\1\1\0\5\1\2\0\1\2\1\1\10\2\1\0"+
+    "\3\2\1\0\3\2\2\0\1\1\17\0\2\1\2\2\2\0\12\3"+
+    "\21\0\3\2\1\0\10\1\2\0\2\1\2\0\26\1\1\0\7\1"+
+    "\1\0\2\1\1\0\5\1\2\0\1\2\1\1\7\2\2\0\2\2"+
+    "\2\0\3\2\10\0\2\2\4\0\2\1\1\0\3\1\2\2\2\0"+
+    "\12\3\1\0\1\1\20\0\1\2\1\1\1\0\6\1\3\0\3\1"+
+    "\1\0\4\1\3\0\2\1\1\0\1\1\1\0\2\1\3\0\2\1"+
+    "\3\0\3\1\3\0\14\1\4\0\5\2\3\0\3\2\1\0\4\2"+
+    "\2\0\1\1\6\0\1\2\16\0\12\3\21\0\3\2\1\0\10\1"+
+    "\1\0\3\1\1\0\27\1\1\0\12\1\1\0\5\1\3\0\1\1"+
+    "\7\2\1\0\3\2\1\0\4\2\7\0\2\2\1\0\2\1\6\0"+
+    "\2\1\2\2\2\0\12\3\22\0\2\2\1\0\10\1\1\0\3\1"+
+    "\1\0\27\1\1\0\12\1\1\0\5\1\2\0\1\2\1\1\7\2"+
+    "\1\0\3\2\1\0\4\2\7\0\2\2\7\0\1\1\1\0\2\1"+
+    "\2\2\2\0\12\3\22\0\2\2\1\0\10\1\1\0\3\1\1\0"+
+    "\27\1\1\0\20\1\3\0\1\1\7\2\1\0\3\2\1\0\4\2"+
+    "\11\0\1\2\10\0\2\1\2\2\2\0\12\3\12\0\6\1\2\0"+
+    "\2\2\1\0\22\1\3\0\30\1\1\0\11\1\1\0\1\1\2\0"+
+    "\7\1\3\0\1\2\4\0\6\2\1\0\1\2\1\0\10\2\22\0"+
+    "\2\2\75\0\1\2\2\0\7\2\14\0\10\2\1\0\12\3\127\0"+
+    "\1\2\2\0\6\2\1\0\2\2\13\0\6\2\2\0\12\3\46\0"+
+    "\1\1\27\0\2\2\6\0\12\3\13\0\1\2\1\0\1\2\1\0"+
+    "\1\2\4\0\2\2\10\1\1\0\44\1\4\0\24\2\1\0\2\2"+
+    "\4\1\4\0\10\2\1\0\44\2\11\0\1\2\144\0\24\2\1\0"+
+    "\12\3\14\0\4\2\4\0\3\2\1\0\3\2\2\0\7\2\3\0"+
+    "\4\2\15\0\14\2\1\0\1\2\12\3\4\2\2\0\46\1\12\0"+
+    "\53\1\1\0\1\1\3\0\u0149\1\1\0\4\1\2\0\7\1\1\0"+
+    "\1\1\1\0\4\1\2\0\51\1\1\0\4\1\2\0\41\1\1\0"+
+    "\4\1\2\0\7\1\1\0\1\1\1\0\4\1\2\0\17\1\1\0"+
+    "\71\1\1\0\4\1\2\0\103\1\4\0\1\2\40\0\20\1\20\0"+
+    "\125\1\14\0\u026c\1\2\0\21\1\1\0\32\1\5\0\113\1\3\0"+
+    "\3\1\17\0\15\1\1\0\4\1\3\2\13\0\22\1\3\2\13\0"+
+    "\22\1\2\2\14\0\15\1\1\0\3\1\1\0\2\2\100\0\40\2"+
+    "\11\0\1\2\2\0\12\3\41\0\3\2\2\0\12\3\6\0\130\1"+
+    "\10\0\51\1\1\2\1\1\5\0\106\1\12\0\35\1\3\0\14\2"+
+    "\4\0\14\2\12\0\12\3\140\0\21\2\7\0\2\2\6\0\13\3"+
+    "\45\0\27\1\5\2\71\0\12\2\1\0\35\2\2\0\1\2\12\3"+
+    "\6\0\12\3\146\0\5\2\57\1\21\2\7\1\4\0\12\3\21\0"+
+    "\11\2\14\0\3\2\36\1\12\2\3\0\2\1\12\3\106\0\44\1"+
+    "\24\2\10\0\12\3\3\0\3\1\12\3\44\1\122\0\3\2\1\0"+
+    "\25\2\4\1\1\2\4\1\1\2\15\0\300\1\47\2\26\0\3\2"+
+    "\u0116\1\2\0\6\1\2\0\46\1\2\0\6\1\2\0\10\1\1\0"+
+    "\1\1\1\0\1\1\1\0\1\1\1\0\37\1\2\0\65\1\1\0"+
+    "\7\1\1\0\1\1\3\0\3\1\1\0\7\1\3\0\4\1\2\0"+
+    "\6\1\4\0\15\1\5\0\3\1\1\0\7\1\17\0\4\2\10\0"+
+    "\2\7\12\0\1\7\2\0\1\5\2\0\5\2\20\0\2\10\3\0"+
+    "\1\6\17\0\1\10\13\0\5\2\5\0\6\2\1\0\1\1\15\0"+
+    "\1\1\20\0\5\1\73\0\41\2\21\0\1\1\4\0\1\1\2\0"+
+    "\12\1\1\0\1\1\3\0\5\1\6\0\1\1\1\0\1\1\1\0"+
+    "\1\1\1\0\4\1\1\0\13\1\2\0\4\1\5\0\5\1\4\0"+
+    "\1\1\21\0\51\1\u032d\0\64\1\u0716\0\57\1\1\0\57\1\1\0"+
+    "\205\1\6\0\4\1\3\2\16\0\46\1\12\0\66\1\11\0\1\1"+
+    "\20\0\27\1\11\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1"+
+    "\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\40\2"+
+    "\57\0\1\1\120\0\32\11\1\0\131\11\14\0\326\11\57\0\1\1"+
+    "\1\0\1\11\31\0\11\11\6\2\1\0\5\4\2\0\3\11\1\1"+
+    "\1\1\4\0\126\11\2\0\2\2\2\4\3\11\133\4\1\0\4\4"+
+    "\5\0\51\1\3\0\136\1\21\0\30\1\70\0\20\4\320\0\57\4"+
+    "\1\0\130\4\250\0\u19b6\11\112\0\u51cc\11\64\0\u048d\1\103\0\56\1"+
+    "\2\0\u010d\1\3\0\20\1\12\3\2\1\24\0\40\1\2\0\15\1"+
+    "\4\2\11\0\2\2\1\0\31\1\10\0\120\1\2\2\45\0\11\1"+
+    "\2\0\147\1\2\0\2\1\156\0\7\1\1\2\3\1\1\2\4\1"+
+    "\1\2\27\1\5\2\30\0\64\1\14\0\2\2\62\1\21\2\13\0"+
+    "\12\3\6\0\22\2\6\1\3\0\1\1\4\0\12\3\34\1\10\2"+
+    "\2\0\27\1\15\2\14\0\35\1\3\0\4\2\57\1\16\2\16\0"+
+    "\1\1\12\3\46\0\51\1\16\2\11\0\3\1\1\2\10\1\2\2"+
+    "\2\0\12\3\41\0\1\2\64\0\1\2\1\0\3\2\2\0\2\2"+
+    "\5\0\2\2\1\0\1\2\376\0\43\1\10\2\1\0\2\2\2\0"+
+    "\12\3\6\0\u2ba4\1\14\0\27\1\4\0\61\1\u2104\0\u012e\11\2\0"+
+    "\76\11\2\0\152\11\46\0\7\1\14\0\5\1\5\0\1\1\1\2"+
+    "\12\1\1\0\15\1\1\0\5\1\1\0\1\1\1\0\2\1\1\0"+
+    "\2\1\1\0\154\1\41\0\u016b\1\22\0\100\1\2\0\66\1\50\0"+
+    "\14\1\4\0\20\2\1\6\2\0\1\5\1\6\13\0\7\2\14\0"+
+    "\2\10\30\0\3\10\1\6\1\0\1\7\1\0\1\6\1\5\32\0"+
+    "\5\1\1\0\207\1\2\0\1\2\7\0\1\7\4\0\1\6\1\0"+
+    "\1\7\13\0\1\5\1\6\5\0\32\1\4\0\1\10\1\0\32\1"+
+    "\13\0\70\4\2\2\37\1\3\0\6\1\2\0\6\1\2\0\6\1"+
+    "\2\0\3\1\34\0\3\2\4\0";
+
+  /** 
+   * Translates characters to character classes
+   */
+  private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+  /** 
+   * Translates DFA states to action switch labels.
+   * Generated table -- do not hand-edit.
+   */
+  private static final int [] ZZ_ACTION = zzUnpackAction();
+
+  // Run-length encoded (count, value) char pairs for ZZ_ACTION.
+  private static final String ZZ_ACTION_PACKED_0 =
+    "\1\1\1\2\1\1\1\3\4\1\1\0\1\1\1\0"+
+    "\1\3\1\0";
+
+  // Expands the packed action table into a 13-entry int array.
+  private static int [] zzUnpackAction() {
+    int [] result = new int[13];
+    int offset = 0;
+    offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+    return result;
+  }
+
+  // RLE decoder: each (count, value) pair writes `value` `count` times.
+  private static int zzUnpackAction(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /** 
+   * Translates a state to a row index in the transition table.
+   * Generated table -- do not hand-edit.
+   */
+  private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+  // Packed as (high, low) char pairs forming 32-bit row offsets.
+  private static final String ZZ_ROWMAP_PACKED_0 =
+    "\0\0\0\12\0\24\0\36\0\50\0\62\0\12\0\74"+
+    "\0\106\0\120\0\132\0\144\0\156";
+
+  private static int [] zzUnpackRowMap() {
+    int [] result = new int[13];
+    int offset = 0;
+    offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+    return result;
+  }
+
+  // Recombines each char pair into one int: high << 16 | low.
+  private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+    int i = 0;  /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int high = packed.charAt(i++) << 16;
+      result[j++] = high | packed.charAt(i++);
+    }
+    return j;
+  }
+
+  /** 
+   * The transition table of the DFA.
+   * Generated table -- do not hand-edit.
+   */
+  private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+  // RLE pairs; stored values are offset by +1 so that 0 can encode -1
+  // (no transition) -- see the value-- in zzUnpackTrans below.
+  private static final String ZZ_TRANS_PACKED_0 =
+    "\1\2\1\3\1\2\1\4\1\5\3\2\1\6\1\7"+
+    "\13\0\2\3\1\10\1\0\1\11\1\0\1\11\1\12"+
+    "\2\0\1\3\2\4\2\0\2\13\1\14\3\0\1\5"+
+    "\1\0\1\5\3\0\1\12\2\0\1\3\1\6\1\4"+
+    "\1\5\3\0\1\6\2\0\1\3\2\10\2\0\2\15"+
+    "\1\12\2\0\1\3\1\11\10\0\1\3\1\12\1\10"+
+    "\1\5\3\0\1\12\3\0\1\13\1\4\7\0\1\3"+
+    "\1\14\1\4\1\5\3\0\1\14\3\0\1\15\1\10"+
+    "\6\0";
+
+  private static int [] zzUnpackTrans() {
+    int [] result = new int[120];
+    int offset = 0;
+    offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackTrans(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      value--;
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /* error codes, used to index ZZ_ERROR_MSG below */
+  private static final int ZZ_UNKNOWN_ERROR = 0;
+  private static final int ZZ_NO_MATCH = 1;
+  private static final int ZZ_PUSHBACK_2BIG = 2;
+
+  /* error messages for the codes above */
+  // NOTE(review): the "Unkown" typo is stock JFlex output -- fix it in the
+  // generator skeleton, not here, or regeneration will reintroduce it.
+  private static final String ZZ_ERROR_MSG[] = {
+    "Unkown internal scanner error",
+    "Error: could not match input",
+    "Error: pushback value was too large"
+  };
+
+  /**
+   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>.
+   * Generated table -- do not hand-edit.
+   */
+  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+  // RLE pairs, decoded by the same scheme as ZZ_ACTION.
+  private static final String ZZ_ATTRIBUTE_PACKED_0 =
+    "\1\1\1\11\4\1\1\11\1\1\1\0\1\1\1\0"+
+    "\1\1\1\0";
+
+  private static int [] zzUnpackAttribute() {
+    int [] result = new int[13];
+    int offset = 0;
+    offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+  /** the input device */
+  private java.io.Reader zzReader;
+
+  /** the current state of the DFA */
+  private int zzState;
+
+  /** the current lexical state */
+  private int zzLexicalState = YYINITIAL;
+
+  /** this buffer contains the current text to be matched and is
+      the source of the yytext() string; grown on demand by zzRefill(),
+      shrunk back to ZZ_BUFFERSIZE by reset(Reader) */
+  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+  /** the textposition at the last accepting state */
+  private int zzMarkedPos;
+
+  /** the current text position in the buffer */
+  private int zzCurrentPos;
+
+  /** startRead marks the beginning of the yytext() string in the buffer */
+  private int zzStartRead;
+
+  /** endRead marks the last character in the buffer, that has been read
+      from input */
+  private int zzEndRead;
+
+  /** number of newlines encountered up to the start of the matched text */
+  private int yyline;
+
+  /** the number of characters up to the start of the matched text;
+      used by end() and populateAttributes() to compute token offsets */
+  private int yychar;
+
+  /**
+   * the number of characters from the last newline up to the start of the 
+   * matched text
+   */
+  private int yycolumn;
+
+  /** 
+   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+   */
+  private boolean zzAtBOL = true;
+
+  /** zzAtEOF == true <=> the scanner is at the EOF */
+  private boolean zzAtEOF;
+
+  /** denotes if the user-EOF-code has already been executed */
+  private boolean zzEOFDone;
+
+  /* user code: */
+  /** Token type emitted for word (alphabetic) matches. */
+  public static final String WORD_TYPE = "<WORD>";
+  /** Token type emitted for numeric matches. */
+  public static final String NUMERIC_TYPE = "<NUM>";
+  
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt 
+    = addAttribute(PositionIncrementAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+  
+  /** Tokens longer than this are skipped (see populateAttributes). */
+  private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
+  /** Position increment for the next emitted token; reset to 1 by
+      incrementToken() and bumped for each skipped over-long token. */
+  private int posIncr;
+
+  
+  /**
+   * Creates a tokenizer that shares attributes with another stream.
+   *
+   * @param source The AttributeSource to use
+   * @param input The input reader
+   */
+  public UAX29Tokenizer(AttributeSource source, Reader input) {
+    super(source, input);
+    zzReader = input;
+  }
+  
+  /**
+   * Creates a tokenizer with a custom attribute factory.
+   *
+   * @param factory The AttributeFactory to use
+   * @param input The input reader
+   */
+  public UAX29Tokenizer(AttributeFactory factory, Reader input) {
+    super(factory, input); 
+    zzReader = input;
+  }
+  
+  /** Sets the final offset: the corrected end of the last text consumed
+      (yychar counts chars up to the last match start; yylength() adds the
+      length of that match). */
+  @Override
+  public final void end() {
+    // set final offset
+    int finalOffset = correctOffset(yychar + yylength());
+    offsetAtt.setOffset(finalOffset, finalOffset);
+  }
+
+  /** Re-targets this tokenizer at a new reader for reuse. */
+  @Override
+  public void reset(Reader reader) throws IOException {
+    super.reset(reader);
+    // If a previous (huge) document caused zzRefill() to grow the buffer,
+    // shrink back to the default size so reuse doesn't pin a large array.
+    if (zzBuffer.length > ZZ_BUFFERSIZE) {
+      zzBuffer = new char[ZZ_BUFFERSIZE];
+    }
+    yyreset(reader);
+  }
+
+  /**
+   * Populates this TokenStream's CharTermAttribute, PositionIncrementAttribute, 
+   * OffsetAttribute, and TypeAttribute from the current match.
+   * 
+   * <p/> 
+   * If maxTokenLength is exceeded, the CharTermAttribute is set back to empty
+   * and false is returned.
+   * 
+   * @param tokenType The type of the matching token
+   * @return true there is a token available; false otherwise 
+   */
+  private boolean populateAttributes(String tokenType) {
+    boolean isTokenAvailable = false;
+    if (yylength() > maxTokenLength) {
+      // When we skip a too-long token, we still increment the position
+      // increment, but only if the token would have otherwise been output
+      // (that is, only if it contains Letters and/or Numbers.)
+      ++posIncr;
+    } else {
+      termAtt.copyBuffer(zzBuffer, zzStartRead, yylength());
+      posIncrAtt.setPositionIncrement(posIncr);
+      offsetAtt.setOffset(correctOffset(yychar), 
+                          correctOffset(yychar + yylength()));
+      typeAtt.setType(tokenType);
+      isTokenAvailable = true;
+    }
+    return isTokenAvailable;
+  }
+
+  /** 
+   * Set the max allowed token length.  Any token longer than this is skipped
+   * (not truncated) by populateAttributes().
+   * @param length the new max allowed token length
+   */
+  public void setMaxTokenLength(int length) {
+    this.maxTokenLength = length;
+  }
+
+  /**
+   * Returns the max allowed token length.  Any token longer than this is 
+   * skipped.
+   * @return the max allowed token length 
+   */
+  public int getMaxTokenLength() {
+    return maxTokenLength;
+  }
+
+  /** Advances to the next token; returns false at end of stream. */
+  @Override
+  public final boolean incrementToken() throws IOException {
+    // This method is required because of two JFlex limitations:
+    // 1. No way to insert code at the beginning of the generated scanning
+    //    get-next-token method; and
+    // 2. No way to declare @Override on the generated scanning method.
+    clearAttributes();
+    posIncr = 1;  // populateAttributes() bumps this for each skipped over-long token
+    return getNextToken();
+  }
+
+
+  /**
+   * Creates a new scanner
+   * There is also a java.io.InputStream version of this constructor.
+   *
+   * @param   in  the java.io.Reader to read input from.
+   */
+  public UAX29Tokenizer(java.io.Reader in) {
+    super(in);
+    this.zzReader = in;
+  }
+
+  /**
+   * Creates a new scanner.
+   * There is also java.io.Reader version of this constructor.
+   *
+   * <p>NOTE(review): wraps the stream with the platform default charset;
+   * callers needing a specific encoding should pass a Reader instead.
+   *
+   * @param   in  the java.io.Inputstream to read input from.
+   */
+  public UAX29Tokenizer(java.io.InputStream in) {
+    this(new java.io.InputStreamReader(in));
+  }
+
+  /** 
+   * Unpacks the compressed character translation table.
+   * Covers the BMP only (64K entries).
+   *
+   * @param packed   the packed character translation table
+   * @return         the unpacked character translation table
+   */
+  private static char [] zzUnpackCMap(String packed) {
+    char [] map = new char[0x10000];
+    int i = 0;  /* index in packed string  */
+    int j = 0;  /* index in unpacked array */
+    // NOTE(review): 2030 is emitted by the generator -- presumably
+    // ZZ_CMAP_PACKED.length(); keep in sync by regenerating, not editing.
+    while (i < 2030) {
+      int  count = packed.charAt(i++);
+      char value = packed.charAt(i++);
+      do map[j++] = value; while (--count > 0);
+    }
+    return map;
+  }
+
+
+  /**
+   * Refills the input buffer.
+   *
+   * Compacts unread data to the front of the buffer, doubles the buffer if
+   * full, then reads more input.
+   *
+   * @return      <code>false</code>, iff there was new input.
+   * 
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  private boolean zzRefill() throws java.io.IOException {
+
+    /* first: make room (if you can) */
+    if (zzStartRead > 0) {
+      System.arraycopy(zzBuffer, zzStartRead,
+                       zzBuffer, 0,
+                       zzEndRead-zzStartRead);
+
+      /* translate stored positions */
+      zzEndRead-= zzStartRead;
+      zzCurrentPos-= zzStartRead;
+      zzMarkedPos-= zzStartRead;
+      zzStartRead = 0;
+    }
+
+    /* is the buffer big enough? */
+    if (zzCurrentPos >= zzBuffer.length) {
+      /* if not: blow it up */
+      char newBuffer[] = new char[zzCurrentPos*2];
+      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+      zzBuffer = newBuffer;
+    }
+
+    /* finally: fill the buffer with new input */
+    int numRead = zzReader.read(zzBuffer, zzEndRead,
+                                            zzBuffer.length-zzEndRead);
+
+    if (numRead > 0) {
+      zzEndRead+= numRead;
+      return false;
+    }
+    // unlikely but not impossible: read 0 characters, but not at end of stream    
+    if (numRead == 0) {
+      // probe with a single-char read to distinguish EOF from a zero-length read
+      int c = zzReader.read();
+      if (c == -1) {
+        return true;
+      } else {
+        zzBuffer[zzEndRead++] = (char) c;
+        return false;
+      }     
+    }
+
+    // numRead < 0: end of stream
+    return true;
+  }
+
+    
+  /**
+   * Closes the input stream and marks the scanner as at-EOF.
+   */
+  public final void yyclose() throws java.io.IOException {
+    zzAtEOF = true;            /* indicate end of file */
+    zzEndRead = zzStartRead;  /* invalidate buffer    */
+
+    if (zzReader != null)
+      zzReader.close();
+  }
+
+
+  /**
+   * Resets the scanner to read from a new input stream.
+   * Does not close the old reader.
+   *
+   * All internal variables are reset, the old input stream 
+   * <b>cannot</b> be reused (internal buffer is discarded and lost).
+   * Lexical state is set to <tt>ZZ_INITIAL</tt>.
+   *
+   * Note: does not shrink zzBuffer; reset(Reader) handles that.
+   *
+   * @param reader   the new input stream 
+   */
+  public final void yyreset(java.io.Reader reader) {
+    zzReader = reader;
+    zzAtBOL  = true;
+    zzAtEOF  = false;
+    zzEOFDone = false;
+    zzEndRead = zzStartRead = 0;
+    zzCurrentPos = zzMarkedPos = 0;
+    yyline = yychar = yycolumn = 0;
+    zzLexicalState = YYINITIAL;
+  }
+
+
+  /**
+   * Returns the current lexical state.
+   */
+  public final int yystate() {
+    return zzLexicalState;
+  }
+
+
+  /**
+   * Enters a new lexical state
+   *
+   * @param newState the new lexical state
+   */
+  public final void yybegin(int newState) {
+    zzLexicalState = newState;
+  }
+
+
+  /**
+   * Returns the text matched by the current regular expression,
+   * i.e. the buffer region [zzStartRead, zzMarkedPos).
+   */
+  public final String yytext() {
+    return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+  }
+
+
+  /**
+   * Returns the character at position <tt>pos</tt> from the 
+   * matched text. 
+   * 
+   * It is equivalent to yytext().charAt(pos), but faster
+   *
+   * @param pos the position of the character to fetch. 
+   *            A value from 0 to yylength()-1.
+   *
+   * @return the character at position pos
+   */
+  public final char yycharat(int pos) {
+    return zzBuffer[zzStartRead+pos];
+  }
+
+
+  /**
+   * Returns the length of the matched text region.
+   */
+  public final int yylength() {
+    return zzMarkedPos-zzStartRead;
+  }
+
+
+  /**
+   * Reports an error that occured while scanning.
+   *
+   * In a wellformed scanner (no or only correct usage of 
+   * yypushback(int) and a match-all fallback rule) this method 
+   * will only be called with things that "Can't Possibly Happen".
+   * If this method is called, something is seriously wrong
+   * (e.g. a JFlex bug producing a faulty scanner etc.).
+   *
+   * Usual syntax/scanner level error handling should be done
+   * in error fallback rules.
+   *
+   * @param   errorCode  the code of the errormessage to display
+   */
+  private void zzScanError(int errorCode) {
+    String message;
+    try {
+      message = ZZ_ERROR_MSG[errorCode];
+    }
+    catch (ArrayIndexOutOfBoundsException e) {
+      // unknown code: fall back to the generic message rather than failing
+      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+    }
+
+    throw new Error(message);
+  } 
+
+
+  /**
+   * Pushes the specified amount of characters back into the input stream.
+   *
+   * They will be read again by the next call of the scanning method.
+   *
+   * @param number  the number of characters to be read again.
+   *                This number must not be greater than yylength()!
+   */
+  public void yypushback(int number)  {
+    if ( number > yylength() )
+      zzScanError(ZZ_PUSHBACK_2BIG);
+
+    zzMarkedPos -= number;
+  }
+
+
+  /**
+   * Resumes scanning until the next regular expression is matched,
+   * the end of input is encountered or an I/O-Error occurs.
+   *
+   * @return      true if a token was produced; false at end of input
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  public boolean getNextToken() throws java.io.IOException {
+    int zzInput;
+    int zzAction;
+
+    // cached fields:
+    int zzCurrentPosL;
+    int zzMarkedPosL;
+    int zzEndReadL = zzEndRead;
+    char [] zzBufferL = zzBuffer;
+    char [] zzCMapL = ZZ_CMAP;
+
+    int [] zzTransL = ZZ_TRANS;
+    int [] zzRowMapL = ZZ_ROWMAP;
+    int [] zzAttrL = ZZ_ATTRIBUTE;
+
+    while (true) {
+      zzMarkedPosL = zzMarkedPos;
+
+      yychar+= zzMarkedPosL-zzStartRead;  // advance absolute char offset past the previous match
+
+      zzAction = -1;  // -1 means "no accepting state seen yet"
+
+      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+  
+      zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+      zzForAction: {
+        while (true) {
+    
+          if (zzCurrentPosL < zzEndReadL)
+            zzInput = zzBufferL[zzCurrentPosL++];
+          else if (zzAtEOF) {
+            zzInput = YYEOF;  // input exhausted: let the default case handle EOF
+            break zzForAction;
+          }
+          else {
+            // store back cached positions
+            zzCurrentPos  = zzCurrentPosL;
+            zzMarkedPos   = zzMarkedPosL;
+            boolean eof = zzRefill();
+            // get translated positions and possibly new buffer
+            zzCurrentPosL  = zzCurrentPos;
+            zzMarkedPosL   = zzMarkedPos;
+            zzBufferL      = zzBuffer;
+            zzEndReadL     = zzEndRead;
+            if (eof) {
+              zzInput = YYEOF;
+              break zzForAction;
+            }
+            else {
+              zzInput = zzBufferL[zzCurrentPosL++];
+            }
+          }
+          int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+          if (zzNext == -1) break zzForAction;  // no DFA transition: longest match (if any) is recorded
+          zzState = zzNext;
+
+          int zzAttributes = zzAttrL[zzState];
+          if ( (zzAttributes & 1) == 1 ) {  // state accepts: remember action and match end
+            zzAction = zzState;
+            zzMarkedPosL = zzCurrentPosL;
+            if ( (zzAttributes & 8) == 8 ) break zzForAction;  // bit 3 set: stop scanning immediately
+          }
+
+        }
+      }
+
+      // store back cached position
+      zzMarkedPos = zzMarkedPosL;
+
+      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {  // zzAction < 0 (no match) handled by default
+        case 3: 
+          { if (populateAttributes(NUMERIC_TYPE)) return true;
+          }  // fell through: too-long token was skipped; keep scanning
+        case 4: break;
+        case 2: 
+          { /* Not numeric, word, ideographic, or hiragana -- ignore it. */
+          }
+        case 5: break;
+        case 1: 
+          { if (populateAttributes(WORD_TYPE)) return true;
+          }  // fell through: too-long token was skipped; keep scanning
+        case 6: break;
+        default: 
+          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+            zzAtEOF = true;  // remember EOF so subsequent calls return immediately
+              {
+                return false;
+              }
+          } 
+          else {
+            zzScanError(ZZ_NO_MATCH);
+          }
+      }
+    }
+  }
+
+
+}
Index: lucene/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex
===================================================================
--- lucene/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex	(revision 0)
+++ lucene/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex	(revision 0)
@@ -0,0 +1,213 @@
+package org.apache.lucene.analysis.standard;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.CharBuffer;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+
+/**
+ * This class implements the Unicode Text Segmentation algorithm, as specified
+ * in Unicode Standard Annex #29: <a href="http://unicode.org/reports/tr29/">UAX #29</a>.
+ */
+%%
+
+%unicode 5.2
+%public
+%class UAX29Tokenizer
+%extends Tokenizer
+%type boolean
+%function getNextToken
+%char
+
+%init{
+  super(in);
+%init}
+
+// WB4. X (Extend | Format)* --> X
+//      (each *Ex macro folds trailing Format/Extend characters into its base class)
+ALetterEx      = \p{WB:ALetter}                     [\p{WB:Format}\p{WB:Extend}]*
+NumericEx      = \p{WB:Numeric}                     [\p{WB:Format}\p{WB:Extend}]*
+KatakanaEx     = \p{WB:Katakana}                    [\p{WB:Format}\p{WB:Extend}]* 
+
+MidLetterEx    = [\p{WB:MidLetter}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]* 
+MidNumericEx   = [\p{WB:MidNum}\p{WB:MidNumLet}]    [\p{WB:Format}\p{WB:Extend}]*
+
+ExtendNumLetEx = \p{WB:ExtendNumLet}                [\p{WB:Format}\p{WB:Extend}]*
+
+%{
+  public static final String WORD_TYPE = "<WORD>";
+  public static final String NUMERIC_TYPE = "<NUM>";
+  
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt 
+    = addAttribute(PositionIncrementAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+  
+  private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;  // tokens longer than this are skipped
+  private int posIncr;  // increment for the next emitted token; grows by one per skipped too-long token
+
+  
+  /**
+   * @param source The AttributeSource to use
+   * @param input The input reader
+   */
+  public UAX29Tokenizer(AttributeSource source, Reader input) {
+    super(source, input);
+    zzReader = input;  // also hand the reader to the generated scanner
+  }
+  
+  /**
+   * @param factory The AttributeFactory to use
+   * @param input The input reader
+   */
+  public UAX29Tokenizer(AttributeFactory factory, Reader input) {
+    super(factory, input); 
+    zzReader = input;  // also hand the reader to the generated scanner
+  }
+  
+  @Override
+  public final void end() {
+    // set final offset
+    int finalOffset = correctOffset(yychar + yylength());  // offset just past the last matched text
+    offsetAtt.setOffset(finalOffset, finalOffset);
+  }
+
+  @Override
+  public void reset(Reader reader) throws IOException {
+    super.reset(reader);
+    if (zzBuffer.length > ZZ_BUFFERSIZE) {
+      zzBuffer = new char[ZZ_BUFFERSIZE];  // release a buffer grown by a very long token
+    }
+    yyreset(reader);
+  }
+
+  /**
+   * Populates this TokenStream's CharTermAttribute, PositionIncrementAttribute, 
+   * OffsetAttribute, and TypeAttribute from the current match.
+   * 
+   * <p/> 
+   * If maxTokenLength is exceeded, no attributes are populated (they remain
+   * as cleared by incrementToken()) and false is returned.
+   * 
+   * @param tokenType The type of the matching token
+   * @return true if there is a token available; false otherwise 
+   */
+  private boolean populateAttributes(String tokenType) {
+    boolean isTokenAvailable = false;
+    if (yylength() > maxTokenLength) {
+      // When we skip a too-long token, we still increment the position
+      // increment, but only if the token would have otherwise been output
+      // (that is, only if it contains Letters and/or Numbers.)
+      ++posIncr;
+    } else {
+      termAtt.copyBuffer(zzBuffer, zzStartRead, yylength());
+      posIncrAtt.setPositionIncrement(posIncr);
+      offsetAtt.setOffset(correctOffset(yychar), 
+                          correctOffset(yychar + yylength()));
+      typeAtt.setType(tokenType);
+      isTokenAvailable = true;
+    }
+    return isTokenAvailable;
+  }
+
+  /** 
+   * Set the max allowed token length.  Any longer token is skipped (it contributes only a position increment).
+   * @param length the new max allowed token length
+   */
+  public void setMaxTokenLength(int length) {
+    this.maxTokenLength = length;
+  }
+
+  /**
+   * Returns the max allowed token length.  Any token longer than this is 
+   * skipped (see {@link #setMaxTokenLength}).
+   * @return the max allowed token length 
+   */
+  public int getMaxTokenLength() {
+    return maxTokenLength;
+  }
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    // This method is required because of two JFlex limitations:
+    // 1. No way to insert code at the beginning of the generated scanning
+    //    get-next-token method; and
+    // 2. No way to declare @Override on the generated scanning method.
+    clearAttributes();
+    posIncr = 1;  // base increment; populateAttributes() adds one per skipped too-long token
+    return getNextToken();
+  }
+%}
+
+%%
+
+// WB1.   sot ÷
+// WB2.   ÷ eot
+//
+<<EOF>> { return false; }  // no token available at end of input
+
+
+// WB5.   ALetter × ALetter
+// WB8.   Numeric × Numeric
+// WB11.  Numeric (MidNum | MidNumLet) × Numeric
+// WB12.  Numeric × (MidNum | MidNumLet) Numeric
+// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
+//
+{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})* {ExtendNumLetEx}* 
+  { if (populateAttributes(NUMERIC_TYPE)) return true; }
+
+
+// WB6.   ALetter × (MidLetter | MidNumLet) ALetter
+// WB7.   ALetter (MidLetter | MidNumLet) × ALetter
+// WB9.   ALetter × Numeric
+// WB10.  Numeric × ALetter
+// WB13.  Katakana × Katakana
+// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
+// (a run of letters/numerics/katakana joined by the allowed medial characters)
+{ExtendNumLetEx}*  ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})* 
+                   | ( {NumericEx}  ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
+                     | {ALetterEx}  ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx}  {ALetterEx} | {ALetterEx})* )* ) 
+({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})* 
+                   | ( {NumericEx}  ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
+                     | {ALetterEx}  ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx}  {ALetterEx} | {ALetterEx})* )* ) )*
+{ExtendNumLetEx}*  
+  { if (populateAttributes(WORD_TYPE)) return true; }
+
+
+// WB14.  Any ÷ Any
+// (Han and Hiragana are emitted as single-character tokens)
+[\p{Script:Han}\p{Script:Hiragana}] { if (populateAttributes(WORD_TYPE)) return true; }
+
+
+// WB3.   CR × LF
+// WB3a.  (Newline | CR | LF) ÷
+// WB3b.  ÷ (Newline | CR | LF)
+// (everything not matched by a rule above, including line terminators, is dropped)
+[^] { /* Not numeric, word, ideographic, or hiragana -- ignore it. */ }
Index: lucene/build.xml
===================================================================
--- lucene/build.xml	(revision 942420)
+++ lucene/build.xml	(working copy)
@@ -661,6 +661,9 @@
     <jflex file="src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl31.jflex"
            outdir="src/java/org/apache/lucene/analysis/standard"
            nobak="on" />
+    <jflex file="src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex"
+           outdir="src/java/org/apache/lucene/analysis/standard"
+           nobak="on" />
   </target>
 
   <target name="clean-jflex">
