Index: lucene/src/test/org/apache/lucene/analysis/TestUAX29Tokenizer.java
===================================================================
--- lucene/src/test/org/apache/lucene/analysis/TestUAX29Tokenizer.java	(revision 0)
+++ lucene/src/test/org/apache/lucene/analysis/TestUAX29Tokenizer.java	(revision 0)
@@ -0,0 +1,186 @@
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.standard.UAX29Tokenizer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class TestUAX29Tokenizer extends BaseTokenStreamTestCase {
+  
+  public void testHugeDoc() throws IOException {
+    StringBuilder sb = new StringBuilder();
+    char whitespace[] = new char[4094];
+    Arrays.fill(whitespace, ' ');
+    sb.append(whitespace);
+    sb.append("testing 1234");
+    String input = sb.toString();
+    UAX29Tokenizer tokenizer = new UAX29Tokenizer(new StringReader(input));
+    assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
+  }
+
+  private Analyzer a = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents
+      (String fieldName, Reader reader) {
+
+      Tokenizer tokenizer = new UAX29Tokenizer(reader);
+      return new TokenStreamComponents(tokenizer);
+    }
+  };
+
+  public void testArmenian() throws Exception {
+    assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
+        new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից", 
+        "ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "Վիքիպեդիայի", "կայքը" } );
+  }
+  
+  public void testAmharic() throws Exception {
+    assertAnalyzesTo(a, "ዊኪፔድያ የባለ ብዙ ቋንቋ የተሟላ ትክክለኛና ነጻ መዝገበ ዕውቀት (ኢንሳይክሎፒዲያ) ነው። ማንኛውም",
+        new String[] { "ዊኪፔድያ", "የባለ", "ብዙ", "ቋንቋ", "የተሟላ", "ትክክለኛና", "ነጻ", "መዝገበ", "ዕውቀት", "ኢንሳይክሎፒዲያ", "ነው", "ማንኛውም" } );
+  }
+  
+  public void testArabic() throws Exception {
+    assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.",
+        new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
+        "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" } ); 
+  }
+  
+  public void testAramaic() throws Exception {
+    assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀",
+        new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "Wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
+        "ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"});
+  }
+  
+  public void testBengali() throws Exception {
+    assertAnalyzesTo(a, "এই বিশ্বকোষ পরিচালনা করে উইকিমিডিয়া ফাউন্ডেশন (একটি অলাভজনক সংস্থা)। উইকিপিডিয়ার শুরু ১৫ জানুয়ারি, ২০০১ সালে। এখন পর্যন্ত ২০০টিরও বেশী ভাষায় উইকিপিডিয়া রয়েছে।",
+        new String[] { "এই", "বিশ্বকোষ", "পরিচালনা", "করে", "উইকিমিডিয়া", "ফাউন্ডেশন", "একটি", "অলাভজনক", "সংস্থা", "উইকিপিডিয়ার",
+        "শুরু", "১৫", "জানুয়ারি", "২০০১", "সালে", "এখন", "পর্যন্ত", "২০০টিরও", "বেশী", "ভাষায়", "উইকিপিডিয়া", "রয়েছে" });
+  }
+  
+  public void testFarsi() throws Exception {
+    assertAnalyzesTo(a, "ویکی پدیای انگلیسی در تاریخ ۲۵ دی ۱۳۷۹ به صورت مکملی برای دانشنامهٔ تخصصی نوپدیا نوشته شد.",
+        new String[] { "ویکی", "پدیای", "انگلیسی", "در", "تاریخ", "۲۵", "دی", "۱۳۷۹", "به", "صورت", "مکملی",
+        "برای", "دانشنامهٔ", "تخصصی", "نوپدیا", "نوشته", "شد" });
+  }
+  
+  public void testGreek() throws Exception {
+    assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.",
+        new String[] { "Γράφεται", "σε", "συνεργασία", "από", "εθελοντές", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
+        "σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
+  }
+
+  public void testTibetan() throws Exception {
+    assertAnalyzesTo(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །",
+                     new String[] { "སྣོན", "མཛོད", "དང", "ལས", "འདིས", "བོད", "ཡིག", 
+                                    "མི", "ཉམས", "གོང", "འཕེལ", "དུ", "གཏོང", "བར", 
+                                    "ཧ", "ཅང", "དགེ", "མཚན", "མཆིས", "སོ" });
+  }
+  
+  /*
+   * For chinese, tokenize as char (these can later form bigrams or whatever)
+   * TODO: why do full-width numerics have no word-break prop?
+   */
+  public void testChinese() throws Exception {
+    assertAnalyzesTo(a, "我是中国人。 １２３４ Ｔｅｓｔｓ ",
+        new String[] { "我", "是", "中", "国", "人", "Ｔｅｓｔｓ"});
+  }
+  
+  public void testEmpty() throws Exception {
+    assertAnalyzesTo(a, "", new String[] {});
+    assertAnalyzesTo(a, ".", new String[] {});
+    assertAnalyzesTo(a, " ", new String[] {});
+  }
+  
+  /* test various jira issues this analyzer is related to */
+  
+  public void testLUCENE1545() throws Exception {
+    /*
+     * Standard analyzer does not correctly tokenize combining character U+0364 COMBINING LATIN SMALL LETTER E.
+     * The word "moͤchte" is incorrectly tokenized into "mo" "chte", the combining character is lost.
+     * Expected result is only one token "moͤchte".
+     */
+    assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" }); 
+  }
+  
+  /* Tests from StandardAnalyzer, just to show behavior is similar */
+  public void testAlphanumericSA() throws Exception {
+    // alphanumeric tokens
+    assertAnalyzesTo(a, "B2B", new String[]{"B2B"});
+    assertAnalyzesTo(a, "2B", new String[]{"2B"});
+  }
+
+  public void testDelimitersSA() throws Exception {
+    // other delimiters: "-", "/", ","
+    assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
+    assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
+    assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
+  }
+
+  public void testApostrophesSA() throws Exception {
+    // internal apostrophes: O'Reilly, you're, O'Reilly's
+    assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"});
+    assertAnalyzesTo(a, "you're", new String[]{"you're"});
+    assertAnalyzesTo(a, "she's", new String[]{"she's"});
+    assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"});
+    assertAnalyzesTo(a, "don't", new String[]{"don't"});
+    assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"});
+  }
+
+  public void testNumericSA() throws Exception {
+    // floating point, serial, model numbers, ip addresses, etc.
+    // every other segment must have at least one digit
+    assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
+    assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
+    assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
+    assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
+  }
+
+  public void testTextWithNumbersSA() throws Exception {
+    // numbers
+    assertAnalyzesTo(a, "David has 5000 bones", new String[]{"David", "has", "5000", "bones"});
+  }
+
+  public void testVariousTextSA() throws Exception {
+    // various
+    assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"C", "embedded", "developers", "wanted"});
+    assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "FOO", "BAR"});
+    assertAnalyzesTo(a, "foo      bar .  FOO <> BAR", new String[]{"foo", "bar", "FOO", "BAR"});
+    assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"QUOTED", "word"});
+  }
+
+  public void testKoreanSA() throws Exception {
+    // Korean words
+    assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
+  }
+  
+  public void testOffsets() throws Exception {
+    assertAnalyzesTo(a, "David has 5000 bones", 
+        new String[] {"David", "has", "5000", "bones"},
+        new int[] {0, 6, 10, 15},
+        new int[] {5, 9, 14, 20});
+  }
+  
+  public void testTypes() throws Exception {
+    assertAnalyzesTo(a, "David has 5000 bones", 
+        new String[] {"David", "has", "5000", "bones"},
+        new String[] { "<WORD>", "<WORD>", "<NUM>", "<WORD>" });
+  }}
Index: lucene/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.java
===================================================================
--- lucene/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.java	(revision 0)
+++ lucene/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.java	(revision 0)
@@ -0,0 +1,931 @@
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 5/8/10 10:43 AM */
+
+package org.apache.lucene.analysis.standard;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.CharBuffer;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+
+/**
+ * This class implements the Unicode Text Segmentation algorithm, as specified
+ * in Unicode Standard Annex #29 <http://unicode.org/reports/tr29/>.
+ */
+
+public class UAX29Tokenizer extends Tokenizer {
+
+  /** This character denotes the end of file */
+  public static final int YYEOF = -1;
+
+  /** initial size of the lookahead buffer */
+  private static final int ZZ_BUFFERSIZE = 16384;
+
+  /** lexical states */
+  public static final int YYINITIAL = 0;
+
+  /**
+   * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+   * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+   *                  at the beginning of a line
+   * l is of the form l = 2*k, k a non negative integer
+   */
+  private static final int ZZ_LEXSTATE[] = { 
+     0, 0
+  };
+
+  /** 
+   * Translates characters to character classes
+   */
+  private static final String ZZ_CMAP_PACKED = 
+    "\11\0\1\4\1\2\2\3\1\1\22\0\1\4\6\0\1\13\4\0"+
+    "\1\12\1\0\1\13\1\0\12\5\1\11\1\12\5\0\32\10\4\0"+
+    "\1\15\1\0\32\10\12\0\1\3\32\0\1\4\11\0\1\10\2\0"+
+    "\1\7\7\0\1\10\1\0\1\11\2\0\1\10\5\0\27\10\1\0"+
+    "\37\10\1\0\u01ca\10\4\0\14\10\16\0\5\10\7\0\1\10\1\0"+
+    "\1\10\21\0\160\7\5\10\1\0\2\10\2\0\4\10\1\12\7\0"+
+    "\1\10\1\11\3\10\1\0\1\10\1\0\24\10\1\0\123\10\1\0"+
+    "\213\10\1\0\7\7\234\10\13\0\46\10\2\0\1\10\7\0\47\10"+
+    "\1\0\1\12\7\0\55\7\1\0\1\7\1\0\2\7\1\0\2\7"+
+    "\1\0\1\7\10\0\33\10\5\0\4\10\1\11\13\0\4\7\10\0"+
+    "\2\12\2\0\13\7\6\0\52\10\24\7\1\0\12\5\1\0\1\5"+
+    "\1\12\1\0\2\10\1\7\143\10\1\0\1\10\17\7\2\10\2\7"+
+    "\1\0\4\7\2\10\12\5\3\10\2\0\1\10\17\0\1\7\1\10"+
+    "\1\7\36\10\33\7\2\0\131\10\13\7\1\10\16\0\12\5\41\10"+
+    "\11\7\2\10\2\0\1\12\1\0\1\10\5\0\26\10\4\7\1\10"+
+    "\11\7\1\10\3\7\1\10\5\7\322\0\4\7\66\10\2\0\1\7"+
+    "\1\10\21\7\1\0\1\10\5\7\2\0\12\10\2\7\2\0\12\5"+
+    "\1\0\2\10\6\0\7\10\1\0\3\7\1\0\10\10\2\0\2\10"+
+    "\2\0\26\10\1\0\7\10\1\0\1\10\3\0\4\10\2\0\1\7"+
+    "\1\10\7\7\2\0\2\7\2\0\3\7\1\10\10\0\1\7\4\0"+
+    "\2\10\1\0\3\10\2\7\2\0\12\5\2\10\17\0\3\7\1\0"+
+    "\6\10\4\0\2\10\2\0\26\10\1\0\7\10\1\0\2\10\1\0"+
+    "\2\10\1\0\2\10\2\0\1\7\1\0\5\7\4\0\2\7\2\0"+
+    "\3\7\3\0\1\7\7\0\4\10\1\0\1\10\7\0\12\5\2\7"+
+    "\3\10\1\7\13\0\3\7\1\0\11\10\1\0\3\10\1\0\26\10"+
+    "\1\0\7\10\1\0\2\10\1\0\5\10\2\0\1\7\1\10\10\7"+
+    "\1\0\3\7\1\0\3\7\2\0\1\10\17\0\2\10\2\7\2\0"+
+    "\12\5\21\0\3\7\1\0\10\10\2\0\2\10\2\0\26\10\1\0"+
+    "\7\10\1\0\2\10\1\0\5\10\2\0\1\7\1\10\7\7\2\0"+
+    "\2\7\2\0\3\7\10\0\2\7\4\0\2\10\1\0\3\10\2\7"+
+    "\2\0\12\5\1\0\1\10\20\0\1\7\1\10\1\0\6\10\3\0"+
+    "\3\10\1\0\4\10\3\0\2\10\1\0\1\10\1\0\2\10\3\0"+
+    "\2\10\3\0\3\10\3\0\14\10\4\0\5\7\3\0\3\7\1\0"+
+    "\4\7\2\0\1\10\6\0\1\7\16\0\12\5\21\0\3\7\1\0"+
+    "\10\10\1\0\3\10\1\0\27\10\1\0\12\10\1\0\5\10\3\0"+
+    "\1\10\7\7\1\0\3\7\1\0\4\7\7\0\2\7\1\0\2\10"+
+    "\6\0\2\10\2\7\2\0\12\5\22\0\2\7\1\0\10\10\1\0"+
+    "\3\10\1\0\27\10\1\0\12\10\1\0\5\10\2\0\1\7\1\10"+
+    "\7\7\1\0\3\7\1\0\4\7\7\0\2\7\7\0\1\10\1\0"+
+    "\2\10\2\7\2\0\12\5\22\0\2\7\1\0\10\10\1\0\3\10"+
+    "\1\0\27\10\1\0\20\10\3\0\1\10\7\7\1\0\3\7\1\0"+
+    "\4\7\11\0\1\7\10\0\2\10\2\7\2\0\12\5\12\0\6\10"+
+    "\2\0\2\7\1\0\22\10\3\0\30\10\1\0\11\10\1\0\1\10"+
+    "\2\0\7\10\3\0\1\7\4\0\6\7\1\0\1\7\1\0\10\7"+
+    "\22\0\2\7\75\0\1\7\2\0\7\7\14\0\10\7\1\0\12\5"+
+    "\127\0\1\7\2\0\6\7\1\0\2\7\13\0\6\7\2\0\12\5"+
+    "\46\0\1\10\27\0\2\7\6\0\12\5\13\0\1\7\1\0\1\7"+
+    "\1\0\1\7\4\0\2\7\10\10\1\0\44\10\4\0\24\7\1\0"+
+    "\2\7\4\10\4\0\10\7\1\0\44\7\11\0\1\7\144\0\24\7"+
+    "\1\0\12\5\14\0\4\7\4\0\3\7\1\0\3\7\2\0\7\7"+
+    "\3\0\4\7\15\0\14\7\1\0\1\7\12\5\4\7\2\0\46\10"+
+    "\12\0\53\10\1\0\1\10\3\0\u0149\10\1\0\4\10\2\0\7\10"+
+    "\1\0\1\10\1\0\4\10\2\0\51\10\1\0\4\10\2\0\41\10"+
+    "\1\0\4\10\2\0\7\10\1\0\1\10\1\0\4\10\2\0\17\10"+
+    "\1\0\71\10\1\0\4\10\2\0\103\10\4\0\1\7\40\0\20\10"+
+    "\20\0\125\10\14\0\u026c\10\2\0\21\10\1\4\32\10\5\0\113\10"+
+    "\3\0\3\10\17\0\15\10\1\0\4\10\3\7\13\0\22\10\3\7"+
+    "\13\0\22\10\2\7\14\0\15\10\1\0\3\10\1\0\2\7\100\0"+
+    "\40\7\11\0\1\7\2\0\12\5\41\0\3\7\1\4\1\0\12\5"+
+    "\6\0\130\10\10\0\51\10\1\7\1\10\5\0\106\10\12\0\35\10"+
+    "\3\0\14\7\4\0\14\7\12\0\12\5\140\0\21\7\7\0\2\7"+
+    "\6\0\13\5\45\0\27\10\5\7\71\0\12\7\1\0\35\7\2\0"+
+    "\1\7\12\5\6\0\12\5\146\0\5\7\57\10\21\7\7\10\4\0"+
+    "\12\5\21\0\11\7\14\0\3\7\36\10\12\7\3\0\2\10\12\5"+
+    "\106\0\44\10\24\7\10\0\12\5\3\0\3\10\12\5\44\10\122\0"+
+    "\3\7\1\0\25\7\4\10\1\7\4\10\1\7\15\0\300\10\47\7"+
+    "\26\0\3\7\u0116\10\2\0\6\10\2\0\46\10\2\0\6\10\2\0"+
+    "\10\10\1\0\1\10\1\0\1\10\1\0\1\10\1\0\37\10\2\0"+
+    "\65\10\1\0\7\10\1\0\1\10\3\0\3\10\1\0\7\10\3\0"+
+    "\4\10\2\0\6\10\4\0\15\10\5\0\3\10\1\0\7\10\3\0"+
+    "\13\4\1\0\4\7\10\0\2\13\12\0\1\13\2\0\1\11\2\3"+
+    "\5\7\1\4\17\0\2\15\3\0\1\12\17\0\1\15\12\0\1\4"+
+    "\5\7\5\0\6\7\1\0\1\10\15\0\1\10\20\0\5\10\73\0"+
+    "\41\7\21\0\1\10\4\0\1\10\2\0\12\10\1\0\1\10\3\0"+
+    "\5\10\6\0\1\10\1\0\1\10\1\0\1\10\1\0\4\10\1\0"+
+    "\13\10\2\0\4\10\5\0\5\10\4\0\1\10\21\0\51\10\u032d\0"+
+    "\64\10\u0716\0\57\10\1\0\57\10\1\0\205\10\6\0\4\10\3\7"+
+    "\16\0\46\10\12\0\66\10\11\0\1\10\20\0\27\10\11\0\7\10"+
+    "\1\0\7\10\1\0\7\10\1\0\7\10\1\0\7\10\1\0\7\10"+
+    "\1\0\7\10\1\0\7\10\1\0\40\7\57\0\1\10\120\0\32\6"+
+    "\1\0\131\6\14\0\326\6\52\0\1\4\4\0\1\10\1\0\1\6"+
+    "\31\0\11\6\6\7\1\0\5\14\2\0\3\6\2\10\134\0\2\7"+
+    "\2\14\3\0\133\14\1\0\4\14\5\0\51\10\3\0\136\10\21\0"+
+    "\30\10\70\0\20\14\320\0\57\14\1\0\130\14\250\0\u19b6\6\112\0"+
+    "\u51cc\6\64\0\u048d\10\103\0\56\10\2\0\u010d\10\3\0\20\10\12\5"+
+    "\2\10\24\0\40\10\2\0\15\10\4\7\11\0\2\7\1\0\31\10"+
+    "\10\0\120\10\2\7\45\0\11\10\2\0\147\10\2\0\2\10\156\0"+
+    "\7\10\1\7\3\10\1\7\4\10\1\7\27\10\5\7\30\0\64\10"+
+    "\14\0\2\7\62\10\21\7\13\0\12\5\6\0\22\7\6\10\3\0"+
+    "\1\10\4\0\12\5\34\10\10\7\2\0\27\10\15\7\14\0\35\10"+
+    "\3\0\4\7\57\10\16\7\16\0\1\10\12\5\46\0\51\10\16\7"+
+    "\11\0\3\10\1\7\10\10\2\7\2\0\12\5\41\0\1\7\64\0"+
+    "\1\7\1\0\3\7\2\0\2\7\5\0\2\7\1\0\1\7\376\0"+
+    "\43\10\10\7\1\0\2\7\2\0\12\5\6\0\u2ba4\10\14\0\27\10"+
+    "\4\0\61\10\u2104\0\u012e\6\2\0\76\6\2\0\152\6\46\0\7\10"+
+    "\14\0\5\10\5\0\1\10\1\7\12\10\1\0\15\10\1\0\5\10"+
+    "\1\0\1\10\1\0\2\10\1\0\2\10\1\0\154\10\41\0\u016b\10"+
+    "\22\0\100\10\2\0\66\10\50\0\14\10\4\0\20\7\1\12\2\0"+
+    "\1\11\1\12\13\0\7\7\14\0\2\15\30\0\3\15\1\12\1\0"+
+    "\1\13\1\0\1\12\1\11\32\0\5\10\1\0\207\10\2\0\1\7"+
+    "\7\0\1\13\4\0\1\12\1\0\1\13\13\0\1\11\1\12\5\0"+
+    "\32\10\4\0\1\15\1\0\32\10\13\0\70\14\2\7\37\10\3\0"+
+    "\6\10\2\0\6\10\2\0\6\10\2\0\3\10\34\0\3\7\4\0";
+
+  /** 
+   * Translates characters to character classes
+   */
+  private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+  /** 
+   * Translates DFA states to action switch labels.
+   */
+  private static final int [] ZZ_ACTION = zzUnpackAction();
+
+  private static final String ZZ_ACTION_PACKED_0 =
+    "\1\0\4\1\1\2\3\3\1\1\1\0\1\4\1\5"+
+    "\1\6\2\0\1\7\1\10\1\11\2\0\1\12\1\13"+
+    "\1\12\1\0\1\12\1\4\1\14\1\15\1\16\1\17"+
+    "\1\20\1\0\1\21\1\7\1\22\1\0\1\23\1\0"+
+    "\1\13\2\0";
+
+  private static int [] zzUnpackAction() {
+    int [] result = new int[42];
+    int offset = 0;
+    offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAction(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /** 
+   * Translates a state to a row index in the transition table
+   */
+  private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+  private static final String ZZ_ROWMAP_PACKED_0 =
+    "\0\0\0\16\0\34\0\52\0\70\0\106\0\124\0\142"+
+    "\0\160\0\176\0\214\0\232\0\250\0\266\0\250\0\304"+
+    "\0\322\0\340\0\356\0\374\0\u010a\0\u0118\0\u0126\0\u0134"+
+    "\0\u0142\0\u0150\0\u015e\0\232\0\232\0\232\0\232\0\232"+
+    "\0\340\0\232\0\232\0\232\0\u0118\0\232\0\u0134\0\232"+
+    "\0\u0150\0\u015e";
+
+  private static int [] zzUnpackRowMap() {
+    int [] result = new int[42];
+    int offset = 0;
+    offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+    int i = 0;  /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int high = packed.charAt(i++) << 16;
+      result[j++] = high | packed.charAt(i++);
+    }
+    return j;
+  }
+
+  /** 
+   * The transition table of the DFA
+   */
+  private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+  private static final String ZZ_TRANS_PACKED_0 =
+    "\1\2\1\3\2\4\1\5\1\6\1\7\1\2\1\10"+
+    "\3\2\1\11\1\12\1\0\3\13\3\0\1\14\6\0"+
+    "\2\15\1\16\31\15\1\0\3\17\3\0\1\14\7\0"+
+    "\3\20\1\0\1\21\1\0\1\22\1\23\1\0\2\24"+
+    "\1\0\1\21\1\0\3\25\3\0\1\26\5\0\1\27"+
+    "\1\0\3\25\1\0\1\27\1\0\1\30\1\27\1\31"+
+    "\1\0\1\31\1\0\1\27\1\0\3\25\3\0\1\32"+
+    "\4\0\2\27\1\0\3\13\1\0\1\23\1\0\1\33"+
+    "\1\23\3\0\2\23\16\34\16\0\16\35\16\36\16\37"+
+    "\16\40\5\0\1\21\1\0\1\41\1\23\1\0\2\24"+
+    "\1\0\1\21\16\42\5\0\1\43\1\0\1\24\6\0"+
+    "\16\44\7\0\1\45\5\0\1\27\16\46\5\0\1\27"+
+    "\1\0\1\47\1\27\1\31\1\0\1\31\1\0\1\27"+
+    "\7\0\1\31\1\50\14\0\1\51\4\0\2\27\5\0"+
+    "\1\23\1\0\1\52\1\23\3\0\2\23";
+
+  private static int [] zzUnpackTrans() {
+    int [] result = new int[364];
+    int offset = 0;
+    offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackTrans(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      value--;
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /* error codes */
+  private static final int ZZ_UNKNOWN_ERROR = 0;
+  private static final int ZZ_NO_MATCH = 1;
+  private static final int ZZ_PUSHBACK_2BIG = 2;
+
+  /* error messages for the codes above */
+  private static final String ZZ_ERROR_MSG[] = {
+    "Unkown internal scanner error",
+    "Error: could not match input",
+    "Error: pushback value was too large"
+  };
+
+  /**
+   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
+   */
+  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+  private static final String ZZ_ATTRIBUTE_PACKED_0 =
+    "\1\0\11\1\1\0\1\11\2\1\2\0\3\1\2\0"+
+    "\3\1\1\0\2\1\5\11\1\0\3\11\1\0\1\11"+
+    "\1\0\1\11\2\0";
+
+  private static int [] zzUnpackAttribute() {
+    int [] result = new int[42];
+    int offset = 0;
+    offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+  /** the input device */
+  private java.io.Reader zzReader;
+
+  /** the current state of the DFA */
+  private int zzState;
+
+  /** the current lexical state */
+  private int zzLexicalState = YYINITIAL;
+
+  /** this buffer contains the current text to be matched and is
+      the source of the yytext() string */
+  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+  /** the textposition at the last accepting state */
+  private int zzMarkedPos;
+
+  /** the current text position in the buffer */
+  private int zzCurrentPos;
+
+  /** startRead marks the beginning of the yytext() string in the buffer */
+  private int zzStartRead;
+
+  /** endRead marks the last character in the buffer, that has been read
+      from input */
+  private int zzEndRead;
+
+  /** number of newlines encountered up to the start of the matched text */
+  private int yyline;
+
+  /** the number of characters up to the start of the matched text */
+  private int yychar;
+
+  /**
+   * the number of characters from the last newline up to the start of the 
+   * matched text
+   */
+  private int yycolumn;
+
+  /** 
+   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+   */
+  private boolean zzAtBOL = true;
+
+  /** zzAtEOF == true <=> the scanner is at the EOF */
+  private boolean zzAtEOF;
+
+  /** denotes if the user-EOF-code has already been executed */
+  private boolean zzEOFDone;
+
+  /* user code: */
+  public static final String WORD_TYPE = "<WORD>";
+  public static final String NUM_TYPE = "<NUM>";
+  
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt 
+    = addAttribute(PositionIncrementAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+  
+  private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
+
+  private int matchStart; 
+  private int posIncr;
+  private boolean hasLetter = false;
+  private boolean hasNumber = false;
+  private boolean isTokenTooLong; // tracks partial match token lengths
+
+  
+  /**
+   * @param source The AttributeSource to use
+   * @param input The input reader
+   */
+  public UAX29Tokenizer(AttributeSource source, Reader input) {
+    super(source, input);
+    zzReader = input;
+  }
+  
+  /**
+   * @param factory The AttributeFactory to use
+   * @param input The input reader
+   */
+  public UAX29Tokenizer(AttributeFactory factory, Reader input) {
+    super(factory, input); 
+    zzReader = input;
+  }
+  
+  @Override
+  public final void end() {
+    // set final offset
+    int finalOffset = correctOffset(yychar + yylength());
+    offsetAtt.setOffset(finalOffset, finalOffset);
+  }
+
+  @Override
+  public void reset(Reader reader) throws IOException {
+    super.reset(reader);
+    if (zzBuffer.length > ZZ_BUFFERSIZE) {
+      zzBuffer = new char[ZZ_BUFFERSIZE];
+    }
+    yyreset(reader);
+  }
+
+  /**
+   * Appends the current (potentially partial token) match to this TokenStream's 
+   * CharTermAttribute.
+   * <p/> 
+   * If maxTokenLength is exceeded, the CharTermAttribute is set back to empty
+   * and false is returned.
+   * 
+   * @param isTokenBoundary true if there is a token boundary after the current
+   *  match; false otherwise.
+   * @return true if there is a token available; false otherwise 
+   */
+  private boolean updateAttributes(boolean isTokenBoundary) {
+    boolean isTokenAvailable = false;
+    if (isTokenTooLong || termAtt.length() + yylength() > maxTokenLength) {
+      // previous or current partial match made the current token too long
+      if (isTokenBoundary) {
+        // When we skip a too-long token, we still increment the position
+        // increment, but only if the token would have otherwise been output
+        // (that is, only if it contains Letters and/or Numbers.)
+        if (hasLetter || hasNumber) {
+          ++posIncr;
+        }
+        clearAttributes();
+        isTokenTooLong = false;  // reset for the next token
+      } else {
+        // when the next token boundary is encountered, the CharTermAttribute
+        // will be emptied - no need to do it here.
+        isTokenTooLong = true;
+      }
+    } else if (isTokenBoundary) {
+      if (hasLetter || hasNumber) {
+        if (0 == termAtt.length()) {
+          matchStart = yychar;
+        }
+        int newLength = termAtt.length() + yylength();
+        char targetBuf[] = termAtt.resizeBuffer(newLength);
+        System.arraycopy(zzBuffer, zzStartRead, targetBuf, termAtt.length(), yylength());
+        termAtt.setLength(newLength);
+        posIncrAtt.setPositionIncrement(posIncr);
+        offsetAtt.setOffset(correctOffset(matchStart), 
+                            correctOffset(matchStart + termAtt.length()));
+        typeAtt.setType(hasLetter ? WORD_TYPE : NUM_TYPE);
+        isTokenAvailable = true;
+      } else {
+        clearAttributes();
+      }
+    } else { // this is not a token boundary
+      if (0 == termAtt.length()) {
+        matchStart = yychar;
+      }
+      int newLength = termAtt.length() + yylength();
+      char targetBuf[] = termAtt.resizeBuffer(newLength);
+      System.arraycopy(zzBuffer, zzStartRead, targetBuf, termAtt.length(), yylength());
+      termAtt.setLength(newLength);
+    }
+    return isTokenAvailable;
+  }
+
+  /** 
+   * Set the max allowed token length.  Any token longer than this is skipped.
+   * @param length the new max allowed token length
+   */
+  public void setMaxTokenLength(int length) {
+    this.maxTokenLength = length;
+  }
+
+  /**
+   * Returns the max allowed token length.  Any token longer than this is 
+   * skipped.
+   * @return the max allowed token length 
+   */
+  public int getMaxTokenLength() {
+    return maxTokenLength;
+  }
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    // This method is required because of two JFlex limitations:
+    // 1. No way to insert code at the beginning of the generated scanning
+    //    get-next-token method; and
+    // 2. No way to declare @Override on the generated scanning method.
+    clearAttributes();
+    posIncr = 1;
+    hasLetter = hasNumber = false;
+    return getNextToken();
+  }
+
+
+  /**
+   * Creates a new scanner
+   * There is also a java.io.InputStream version of this constructor.
+   *
+   * @param   in  the java.io.Reader to read input from.
+   */
+  public UAX29Tokenizer(java.io.Reader in) {
+    super(in);
+    this.zzReader = in;
+  }
+
+  /**
+   * Creates a new scanner.
+   * There is also a java.io.Reader version of this constructor.
+   *
+   * @param   in  the java.io.InputStream to read input from.
+   */
+  public UAX29Tokenizer(java.io.InputStream in) {
+    this(new java.io.InputStreamReader(in));
+  }
+
+  /** 
+   * Unpacks the compressed character translation table.
+   *
+   * @param packed   the packed character translation table
+   * @return         the unpacked character translation table
+   */
+  private static char [] zzUnpackCMap(String packed) {
+    char [] map = new char[0x10000];
+    int i = 0;  /* index in packed string  */
+    int j = 0;  /* index in unpacked array */
+    while (i < 2060) {
+      int  count = packed.charAt(i++);
+      char value = packed.charAt(i++);
+      do map[j++] = value; while (--count > 0);
+    }
+    return map;
+  }
+
+
+  /**
+   * Refills the input buffer.
+   *
+   * @return      <code>false</code>, iff there was new input.
+   * 
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  private boolean zzRefill() throws java.io.IOException {
+
+    /* first: make room (if you can) */
+    if (zzStartRead > 0) {
+      System.arraycopy(zzBuffer, zzStartRead,
+                       zzBuffer, 0,
+                       zzEndRead-zzStartRead);
+
+      /* translate stored positions */
+      zzEndRead-= zzStartRead;
+      zzCurrentPos-= zzStartRead;
+      zzMarkedPos-= zzStartRead;
+      zzStartRead = 0;
+    }
+
+    /* is the buffer big enough? */
+    if (zzCurrentPos >= zzBuffer.length) {
+      /* if not: blow it up */
+      char newBuffer[] = new char[zzCurrentPos*2];
+      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+      zzBuffer = newBuffer;
+    }
+
+    /* finally: fill the buffer with new input */
+    int numRead = zzReader.read(zzBuffer, zzEndRead,
+                                            zzBuffer.length-zzEndRead);
+
+    if (numRead > 0) {
+      zzEndRead+= numRead;
+      return false;
+    }
+    // unlikely but not impossible: read 0 characters, but not at end of stream    
+    if (numRead == 0) {
+      int c = zzReader.read();
+      if (c == -1) {
+        return true;
+      } else {
+        zzBuffer[zzEndRead++] = (char) c;
+        return false;
+      }     
+    }
+
+	// numRead < 0
+    return true;
+  }
+
+    
+  /**
+   * Closes the input stream.
+   */
+  public final void yyclose() throws java.io.IOException {
+    zzAtEOF = true;            /* indicate end of file */
+    zzEndRead = zzStartRead;  /* invalidate buffer    */
+
+    if (zzReader != null)
+      zzReader.close();
+  }
+
+
+  /**
+   * Resets the scanner to read from a new input stream.
+   * Does not close the old reader.
+   *
+   * All internal variables are reset, the old input stream 
+   * <b>cannot</b> be reused (internal buffer is discarded and lost).
+   * Lexical state is set to <tt>ZZ_INITIAL</tt>.
+   *
+   * @param reader   the new input stream 
+   */
+  public final void yyreset(java.io.Reader reader) {
+    zzReader = reader;
+    zzAtBOL  = true;
+    zzAtEOF  = false;
+    zzEOFDone = false;
+    zzEndRead = zzStartRead = 0;
+    zzCurrentPos = zzMarkedPos = 0;
+    yyline = yychar = yycolumn = 0;
+    zzLexicalState = YYINITIAL;
+  }
+
+
+  /**
+   * Returns the current lexical state.
+   */
+  public final int yystate() {
+    return zzLexicalState;
+  }
+
+
+  /**
+   * Enters a new lexical state
+   *
+   * @param newState the new lexical state
+   */
+  public final void yybegin(int newState) {
+    zzLexicalState = newState;
+  }
+
+
+  /**
+   * Returns the text matched by the current regular expression.
+   */
+  public final String yytext() {
+    return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+  }
+
+
+  /**
+   * Returns the character at position <tt>pos</tt> from the 
+   * matched text. 
+   * 
+   * It is equivalent to yytext().charAt(pos), but faster
+   *
+   * @param pos the position of the character to fetch. 
+   *            A value from 0 to yylength()-1.
+   *
+   * @return the character at position pos
+   */
+  public final char yycharat(int pos) {
+    return zzBuffer[zzStartRead+pos];
+  }
+
+
+  /**
+   * Returns the length of the matched text region.
+   */
+  public final int yylength() {
+    return zzMarkedPos-zzStartRead;
+  }
+
+
+  /**
+   * Reports an error that occurred while scanning.
+   *
+   * In a wellformed scanner (no or only correct usage of 
+   * yypushback(int) and a match-all fallback rule) this method 
+   * will only be called with things that "Can't Possibly Happen".
+   * If this method is called, something is seriously wrong
+   * (e.g. a JFlex bug producing a faulty scanner etc.).
+   *
+   * Usual syntax/scanner level error handling should be done
+   * in error fallback rules.
+   *
+   * @param   errorCode  the code of the errormessage to display
+   */
+  private void zzScanError(int errorCode) {
+    String message;
+    try {
+      message = ZZ_ERROR_MSG[errorCode];
+    }
+    catch (ArrayIndexOutOfBoundsException e) {
+      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+    }
+
+    throw new Error(message);
+  } 
+
+
+  /**
+   * Pushes the specified amount of characters back into the input stream.
+   *
+   * They will be read again by the next call of the scanning method
+   *
+   * @param number  the number of characters to be read again.
+   *                This number must not be greater than yylength()!
+   */
+  public void yypushback(int number)  {
+    if ( number > yylength() )
+      zzScanError(ZZ_PUSHBACK_2BIG);
+
+    zzMarkedPos -= number;
+  }
+
+
+  /**
+   * Resumes scanning until the next regular expression is matched,
+   * the end of input is encountered or an I/O-Error occurs.
+   *
+   * @return      the next token
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  public boolean getNextToken() throws java.io.IOException {
+    int zzInput;
+    int zzAction;
+
+    // cached fields:
+    int zzCurrentPosL;
+    int zzMarkedPosL;
+    int zzEndReadL = zzEndRead;
+    char [] zzBufferL = zzBuffer;
+    char [] zzCMapL = ZZ_CMAP;
+
+    int [] zzTransL = ZZ_TRANS;
+    int [] zzRowMapL = ZZ_ROWMAP;
+    int [] zzAttrL = ZZ_ATTRIBUTE;
+
+    while (true) {
+      zzMarkedPosL = zzMarkedPos;
+
+      yychar+= zzMarkedPosL-zzStartRead;
+
+      zzAction = -1;
+
+      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+  
+      zzState = ZZ_LEXSTATE[zzLexicalState];
+
+
+      zzForAction: {
+        while (true) {
+    
+          if (zzCurrentPosL < zzEndReadL)
+            zzInput = zzBufferL[zzCurrentPosL++];
+          else if (zzAtEOF) {
+            zzInput = YYEOF;
+            break zzForAction;
+          }
+          else {
+            // store back cached positions
+            zzCurrentPos  = zzCurrentPosL;
+            zzMarkedPos   = zzMarkedPosL;
+            boolean eof = zzRefill();
+            // get translated positions and possibly new buffer
+            zzCurrentPosL  = zzCurrentPos;
+            zzMarkedPosL   = zzMarkedPos;
+            zzBufferL      = zzBuffer;
+            zzEndReadL     = zzEndRead;
+            if (eof) {
+              zzInput = YYEOF;
+              break zzForAction;
+            }
+            else {
+              zzInput = zzBufferL[zzCurrentPosL++];
+            }
+          }
+          int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+          if (zzNext == -1) break zzForAction;
+          zzState = zzNext;
+
+          int zzAttributes = zzAttrL[zzState];
+          if ( (zzAttributes & 1) == 1 ) {
+            zzAction = zzState;
+            zzMarkedPosL = zzCurrentPosL;
+            if ( (zzAttributes & 8) == 8 ) break zzForAction;
+          }
+
+        }
+      }
+
+      // store back cached position
+      zzMarkedPos = zzMarkedPosL;
+
+      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+        case 14: 
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 2;
+          { /* ignore whitespace */
+          }
+        case 20: break;
+        case 6: 
+          { /* ignore whitespace */
+          }
+        case 21: break;
+        case 13: 
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { /* ignore whitespace */
+          }
+        case 22: break;
+        case 16: 
+          // lookahead expression with fixed lookahead length
+          yypushback(2);
+          { hasNumber = true; updateAttributes(false);
+          }
+        case 23: break;
+        case 7: 
+          // lookahead expression with fixed lookahead length
+          yypushback(1);
+          { hasNumber = true; updateAttributes(false);
+          }
+        case 24: break;
+        case 8: 
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { hasNumber = true; updateAttributes(false);
+          }
+        case 25: break;
+        case 5: 
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { /* Ignore whitespace */
+          }
+        case 26: break;
+        case 17: 
+          // lookahead expression with fixed lookahead length
+          yypushback(2);
+          { updateAttributes(false);
+          }
+        case 27: break;
+        case 4: 
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { updateAttributes(false);
+          }
+        case 28: break;
+        case 9: 
+          // lookahead expression with fixed lookahead length
+          yypushback(1);
+          { updateAttributes(false);
+          }
+        case 29: break;
+        case 19: 
+          // lookahead expression with fixed lookahead length
+          yypushback(2);
+          { hasLetter = true; updateAttributes(false);
+          }
+        case 30: break;
+        case 10: 
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { hasLetter = true; updateAttributes(false);
+          }
+        case 31: break;
+        case 11: 
+          // lookahead expression with fixed lookahead length
+          yypushback(1);
+          { hasLetter = true; updateAttributes(false);
+          }
+        case 32: break;
+        case 1: 
+          { if (updateAttributes(true)) return true;
+          }
+        case 33: break;
+        case 12: 
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { if (updateAttributes(true)) return true;
+          }
+        case 34: break;
+        case 15: 
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { hasNumber = true; if (updateAttributes(true)) return true;
+          }
+        case 35: break;
+        case 2: 
+          { hasNumber = true; if (updateAttributes(true)) return true;
+          }
+        case 36: break;
+        case 18: 
+          // lookahead expression with fixed base length
+          zzMarkedPos = zzStartRead + 1;
+          { hasLetter = true; if (updateAttributes(true)) return true;
+          }
+        case 37: break;
+        case 3: 
+          { hasLetter = true; if (updateAttributes(true)) return true;
+          }
+        case 38: break;
+        default: 
+          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+            zzAtEOF = true;
+              {
+                return false;
+              }
+          } 
+          else {
+            zzScanError(ZZ_NO_MATCH);
+          }
+      }
+    }
+  }
+
+
+}
Index: lucene/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex
===================================================================
--- lucene/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex	(revision 0)
+++ lucene/src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex	(revision 0)
@@ -0,0 +1,333 @@
+package org.apache.lucene.analysis.standard;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.CharBuffer;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+
+/**
+ * This class implements the Unicode Text Segmentation algorithm, as specified
+ * in Unicode Standard Annex #29 <http://unicode.org/reports/tr29/>.
+ */
+%%
+
+%unicode 5.2
+%public
+%class UAX29Tokenizer
+%extends Tokenizer
+%type boolean
+%function getNextToken
+%char
+
+%init{
+  super(in);
+%init}
+
+
+%{
+  public static final String WORD_TYPE = "<WORD>";
+  public static final String NUM_TYPE = "<NUM>";
+  
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt 
+    = addAttribute(PositionIncrementAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+  
+  private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
+
+  private int matchStart; 
+  private int posIncr;
+  private boolean hasLetter = false;
+  private boolean hasNumber = false;
+  private boolean isTokenTooLong; // tracks partial match token lengths
+
+  
+  /**
+   * @param source The AttributeSource to use
+   * @param input The input reader
+   */
+  public UAX29Tokenizer(AttributeSource source, Reader input) {
+    super(source, input);
+    zzReader = input;
+  }
+  
+  /**
+   * @param factory The AttributeFactory to use
+   * @param input The input reader
+   */
+  public UAX29Tokenizer(AttributeFactory factory, Reader input) {
+    super(factory, input); 
+    zzReader = input;
+  }
+  
+  @Override
+  public final void end() {
+    // set final offset
+    int finalOffset = correctOffset(yychar + yylength());
+    offsetAtt.setOffset(finalOffset, finalOffset);
+  }
+
+  @Override
+  public void reset(Reader reader) throws IOException {
+    super.reset(reader);
+    if (zzBuffer.length > ZZ_BUFFERSIZE) {
+      zzBuffer = new char[ZZ_BUFFERSIZE];
+    }
+    yyreset(reader);
+  }
+
+  /**
+   * Appends the current (potentially partial token) match to this TokenStream's 
+   * CharTermAttribute.
+   * <p/> 
+   * If maxTokenLength is exceeded, the CharTermAttribute is set back to empty
+   * and false is returned.
+   * 
+   * @param isTokenBoundary true if there is a token boundary after the current
+   *  match; false otherwise.
+   * @return true if a token is available; false otherwise 
+   */
+  private boolean updateAttributes(boolean isTokenBoundary) {
+    boolean isTokenAvailable = false;
+    if (isTokenTooLong || termAtt.length() + yylength() > maxTokenLength) {
+      // previous or current partial match made the current token too long
+      if (isTokenBoundary) {
+        // When we skip a too-long token, we still increment the position
+        // increment, but only if the token would have otherwise been output
+        // (that is, only if it contains Letters and/or Numbers.)
+        if (hasLetter || hasNumber) {
+          ++posIncr;
+        }
+        clearAttributes();
+        isTokenTooLong = false;  // reset for the next token
+      } else {
+        // when the next token boundary is encountered, the CharTermAttribute
+        // will be emptied - no need to do it here.
+        isTokenTooLong = true;
+      }
+    } else if (isTokenBoundary) {
+      if (hasLetter || hasNumber) {
+        if (0 == termAtt.length()) {
+          matchStart = yychar;
+        }
+        int newLength = termAtt.length() + yylength();
+        char targetBuf[] = termAtt.resizeBuffer(newLength);
+        System.arraycopy(zzBuffer, zzStartRead, targetBuf, termAtt.length(), yylength());
+        termAtt.setLength(newLength);
+        posIncrAtt.setPositionIncrement(posIncr);
+        offsetAtt.setOffset(correctOffset(matchStart), 
+                            correctOffset(matchStart + termAtt.length()));
+        typeAtt.setType(hasLetter ? WORD_TYPE : NUM_TYPE);
+        isTokenAvailable = true;
+      } else {
+        clearAttributes();
+      }
+    } else { // this is not a token boundary
+      if (0 == termAtt.length()) {
+        matchStart = yychar;
+      }
+      int newLength = termAtt.length() + yylength();
+      char targetBuf[] = termAtt.resizeBuffer(newLength);
+      System.arraycopy(zzBuffer, zzStartRead, targetBuf, termAtt.length(), yylength());
+      termAtt.setLength(newLength);
+    }
+    return isTokenAvailable;
+  }
+
+  /** 
+   * Set the max allowed token length.  Any token longer than this is skipped.
+   * @param length the new max allowed token length
+   */
+  public void setMaxTokenLength(int length) {
+    this.maxTokenLength = length;
+  }
+
+  /**
+   * Returns the max allowed token length.  Any token longer than this is 
+   * skipped.
+   * @return the max allowed token length 
+   */
+  public int getMaxTokenLength() {
+    return maxTokenLength;
+  }
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    // This method is required because of two JFlex limitations:
+    // 1. No way to insert code at the beginning of the generated scanning
+    //    get-next-token method; and
+    // 2. No way to declare @Override on the generated scanning method.
+    clearAttributes();
+    posIncr = 1;
+    hasLetter = hasNumber = false;
+    return getNextToken();
+  }
+%}
+
+%%
+
+// Break at the start and end of text.
+// WB1. 	sot 	÷ 	
+// WB2. 		÷ 	eot
+<<EOF>> { return false; }
+
+
+//
+// Do not break within CRLF.
+//
+// WB3. 	CR 	×  LF
+//
+\p{WB:CR} \p{WB:LF} / [^]  {  /* ignore whitespace */  }
+\p{WB:CR} \p{WB:LF}        {  /* ignore whitespace */  }
+
+
+// Otherwise break before and after Newlines (including CR and LF)
+//
+// WB3a. 	(Newline | CR | LF) 	÷
+[\p{WB:Newline}\p{WB:CR}\p{WB:LF}] / [^]{2} {  /* ignore whitespace */  }
+[\p{WB:Newline}\p{WB:CR}\p{WB:LF}] / [^]    {  /* Ignore whitespace */  }
+
+
+// WB3b. 	  	÷ 	(Newline | CR | LF)
+//
+\s                                            / [\p{WB:Newline}\p{WB:CR}\p{WB:LF}] [^] {  /* ignore whitespace */  }
+\p{WB:Numeric}                                / [\p{WB:Newline}\p{WB:CR}\p{WB:LF}] [^] { hasNumber = true; if (updateAttributes(true)) return true; }
+[\p{WB:Katakana}\p{WB:ALetter}\p{Script:Han}] / [\p{WB:Newline}\p{WB:CR}\p{WB:LF}] [^] { hasLetter = true; if (updateAttributes(true)) return true; }
+[^]                                           / [\p{WB:Newline}\p{WB:CR}\p{WB:LF}] [^] { if (updateAttributes(true)) return true; }
+
+
+// Ignore Format and Extend characters, except when they appear at the 
+// beginning of a region of text.
+//
+// (See Section 6.2, Replacing Ignore Rules.)
+//
+// WB4. 	X (Extend | Format)* 	? 	X
+//
+//      --> [^ Newline CR LF ] × [Format Extend]
+//
+\p{WB:Numeric}                                / [\p{WB:Format}\p{WB:Extend}] { hasNumber = true; updateAttributes(false); }
+[\p{WB:Katakana}\p{WB:ALetter}\p{Script:Han}] / [\p{WB:Format}\p{WB:Extend}] { hasLetter = true; updateAttributes(false); }
+[^\p{WB:Newline}\p{WB:CR}\p{WB:LF}]           / [\p{WB:Format}\p{WB:Extend}] { updateAttributes(false); }
+
+
+// Do not break between most letters.
+//
+// WB5. 	ALetter 	× 	ALetter
+//
+// [included WB4. 	X (Extend | Format)* 	? 	X]
+//
+\p{WB:ALetter} [\p{WB:Format}\p{WB:Extend}]* / \p{WB:ALetter} [^] { hasLetter = true; updateAttributes(false); }
+\p{WB:ALetter} [\p{WB:Format}\p{WB:Extend}]* / \p{WB:ALetter}     { hasLetter = true; updateAttributes(false); }
+
+
+// Do not break letters across certain punctuation.
+//
+// WB6. 	ALetter × (MidLetter | MidNumLet) ALetter
+// WB7. 	ALetter (MidLetter | MidNumLet) × ALetter
+//
+// [included WB4. 	X (Extend | Format)* 	? 	X]
+//
+\p{WB:ALetter} [\p{WB:Format}\p{WB:Extend}]* [\p{WB:MidLetter}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]* / \p{WB:ALetter} { hasLetter = true; updateAttributes(false); }
+
+
+// Do not break within sequences of digits, or digits adjacent to letters 
+// (“3a”, or “A3”).
+//
+// WB8. 	Numeric 	× 	Numeric
+//
+// [included WB4. 	X (Extend | Format)* 	? 	X]
+//
+\p{WB:Numeric} [\p{WB:Format}\p{WB:Extend}]* / \p{WB:Numeric} [^] { hasNumber = true; updateAttributes(false); }
+\p{WB:Numeric} [\p{WB:Format}\p{WB:Extend}]* / \p{WB:Numeric}     { hasNumber = true; updateAttributes(false); }
+
+
+// WB9. 	ALetter 	× 	Numeric
+//
+// [included WB4. 	X (Extend | Format)* 	? 	X]
+//
+\p{WB:ALetter} [\p{WB:Format}\p{WB:Extend}]* / \p{WB:Numeric} [^] { hasLetter = true; updateAttributes(false); }
+\p{WB:ALetter} [\p{WB:Format}\p{WB:Extend}]* / \p{WB:Numeric}     { hasLetter = true; updateAttributes(false); }
+
+
+// WB10. 	Numeric 	× 	ALetter
+//
+// [included WB4. 	X (Extend | Format)* 	? 	X]
+//
+\p{WB:Numeric} [\p{WB:Format}\p{WB:Extend}]* / \p{WB:ALetter} [^] { updateAttributes(false); }
+\p{WB:Numeric} [\p{WB:Format}\p{WB:Extend}]* / \p{WB:ALetter}     { updateAttributes(false); }
+
+
+// Do not break within sequences, such as “3.2” or “3,456.789”.
+//
+// WB11. 	Numeric (MidNum | MidNumLet) 	× 	Numeric
+// WB12. 	Numeric 	× 	(MidNum | MidNumLet) Numeric
+//
+// [included WB4. 	X (Extend | Format)* 	? 	X]
+//
+\p{WB:Numeric} [\p{WB:Format}\p{WB:Extend}]* [\p{WB:MidNum}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]* / \p{WB:Numeric} { hasNumber = true; updateAttributes(false); }
+
+
+// Do not break between Katakana.
+//
+// WB13. 	Katakana 	× 	Katakana
+//
+// [included WB4. 	X (Extend | Format)* 	? 	X]
+//
+\p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]* / \p{WB:Katakana} [^] { hasLetter = true; updateAttributes(false); }
+\p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]* / \p{WB:Katakana}     { hasLetter = true; updateAttributes(false); }
+
+
+// Do not break from extenders.
+//
+// WB13a. 	(ALetter | Numeric | Katakana | ExtendNumLet) 	× 	ExtendNumLet
+//
+// [included WB4. 	X (Extend | Format)* 	? 	X]
+//
+\p{WB:Numeric}                                [\p{WB:Format}\p{WB:Extend}]* / \p{WB:ExtendNumLet} [^] { hasNumber = true; updateAttributes(false); }
+[\p{WB:ALetter}\p{WB:Katakana}\p{Script:Han}] [\p{WB:Format}\p{WB:Extend}]* / \p{WB:ExtendNumLet} [^] { hasLetter = true; updateAttributes(false); }
+\p{WB:ExtendNumLet}                           [\p{WB:Format}\p{WB:Extend}]* / \p{WB:ExtendNumLet} [^] { updateAttributes(false); }
+\p{WB:Numeric}                                [\p{WB:Format}\p{WB:Extend}]* / \p{WB:ExtendNumLet} { hasNumber = true; updateAttributes(false); }
+[\p{WB:ALetter}\p{WB:Katakana}\p{Script:Han}] [\p{WB:Format}\p{WB:Extend}]* / \p{WB:ExtendNumLet} { hasLetter = true; updateAttributes(false); }
+\p{WB:ExtendNumLet}                           [\p{WB:Format}\p{WB:Extend}]* / \p{WB:ExtendNumLet} { updateAttributes(false); }
+
+
+// WB13b. 	ExtendNumLet 	× 	(ALetter | Numeric | Katakana)
+//
+// [included WB4. 	X (Extend | Format)* 	? 	X]
+//
+\p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]* / [\p{WB:ALetter}\p{WB:Numeric}\p{WB:Katakana}] [^] { updateAttributes(false); }
+\p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]* / [\p{WB:ALetter}\p{WB:Numeric}\p{WB:Katakana}]     { updateAttributes(false); }
+
+
+// Otherwise, break everywhere (including around ideographs).
+//
+// WB14. 	Any 	÷ 	Any
+//
+\p{WB:Numeric}                                { hasNumber = true; if (updateAttributes(true)) return true; }
+[\p{WB:ALetter}\p{WB:Katakana}\p{Script:Han}] { hasLetter = true; if (updateAttributes(true)) return true; }
+[^]                                           { if (updateAttributes(true)) return true; }
Index: lucene/build.xml
===================================================================
--- lucene/build.xml	(revision 941990)
+++ lucene/build.xml	(working copy)
@@ -661,6 +661,9 @@
     <jflex file="src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl31.jflex"
            outdir="src/java/org/apache/lucene/analysis/standard"
            nobak="on" />
+    <jflex file="src/java/org/apache/lucene/analysis/standard/UAX29Tokenizer.jflex"
+           outdir="src/java/org/apache/lucene/analysis/standard"
+           nobak="on" />
   </target>
 
   <target name="clean-jflex">
