Index: contrib/icu/build.xml
===================================================================
--- contrib/icu/build.xml	(revision 0)
+++ contrib/icu/build.xml	(revision 0)
@@ -0,0 +1,73 @@
+<?xml version="1.0"?>
+
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+ 
+        http://www.apache.org/licenses/LICENSE-2.0
+ 
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+ -->
+
+<project name="icu" default="default">
+
+  <description>
+    Lucene Unicode/ICU Analyzer: Analyzer for General Unicode Text
+  </description>
+
+  <!-- Jars matching icu4j-*.jar under lib/. -->
+  <path id="additional.dependencies">
+    <fileset includes="icu4j-*.jar" dir="lib"/>
+  </path>
+
+  <!-- The same jars as a unix-style path string. -->
+  <pathconvert refid="additional.dependencies" targetos="unix" property="project.classpath"/>
+
+  <import file="../contrib-build.xml"/>
+
+  <!-- common.init, then the up-to-date check, then compile-rbbi if the rule data is stale. -->
+  <target name="init" depends="common.init,rbbi-uptodate-check,compile-rbbi"/>
+
+  <!-- Runs init, then hands over to common.compile. -->
+  <target name="compile" depends="init">
+    <antcall inheritRefs="true" target="common.compile"/>
+  </target>
+
+  <!-- Sets data.files.uptodate when every .rbbi file has an up-to-date .brk file. -->
+  <target name="rbbi-uptodate-check">
+    <uptodate property="data.files.uptodate">
+      <srcfiles includes="**/*.rbbi" dir="src/resources"/>
+      <mapper to="*.brk" from="*.rbbi" type="glob"/>
+    </uptodate>
+  </target>
+
+  <!-- Runs RBBIRuleCompiler in src/resources unless data.files.uptodate is set. -->
+  <target name="compile-rbbi" depends="compile-tools" unless="data.files.uptodate">
+    <java classname="org.apache.lucene.icu.RBBIRuleCompiler"
+          dir="src/resources"
+          failonerror="true"
+          fork="true">
+      <classpath>
+        <path refid="additional.dependencies"/>
+        <pathelement location="${build.dir}/classes/tools"/>
+      </classpath>
+    </java>
+  </target>
+
+  <!-- Compiles src/tools/java into ${build.dir}/classes/tools. -->
+  <target name="compile-tools">
+    <compile srcdir="src/tools/java"
+             destdir="${build.dir}/classes/tools">
+      <classpath refid="classpath"/>
+    </compile>
+  </target>
+
+</project>

Property changes on: contrib\icu\build.xml
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/pom.xml.template
===================================================================
--- contrib/icu/pom.xml.template	(revision 0)
+++ contrib/icu/pom.xml.template	(revision 0)
@@ -0,0 +1,46 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <!-- Maven POM template for the lucene-icu contrib module: its coordinates and its ICU4J dependency. -->
+  <!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+    
+    http://www.apache.org/licenses/LICENSE-2.0
+    
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+  -->
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.lucene</groupId>
+    <artifactId>lucene-contrib</artifactId>
+    <version>@version@</version> <!-- placeholder token; presumably replaced when the template is expanded (confirm in the build) -->
+  </parent>
+  <groupId>org.apache.lucene</groupId>
+  <artifactId>lucene-icu</artifactId>
+  <name>
+    Lucene Unicode/ICU Analyzer
+  </name>
+  <version>@version@</version>
+  <description>    
+    Analyzer for General Unicode Text
+  </description>
+  <packaging>jar</packaging>
+  <dependencies>
+    <dependency>
+      <groupId>com.ibm.icu</groupId>
+      <artifactId>icu4j</artifactId>
+      <version>${icu-version}</version> <!-- NOTE(review): icu-version is not defined in this file; confirm where it is set -->
+    </dependency>
+  </dependencies>
+</project>
Index: contrib/icu/lib/ICU-LICENSE.txt
===================================================================
--- contrib/icu/lib/ICU-LICENSE.txt	(revision 0)
+++ contrib/icu/lib/ICU-LICENSE.txt	(revision 0)
@@ -0,0 +1,33 @@
+ICU License - ICU 1.8.1 and later
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright (c) 1995-2008 International Business Machines Corporation and others
+
+All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, and/or sell copies of the
+Software, and to permit persons to whom the Software is furnished to do so,
+provided that the above copyright notice(s) and this permission notice appear
+in all copies of the Software and that both the above copyright notice(s) and
+this permission notice appear in supporting documentation.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
+LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
+ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
+IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder shall not
+be used in advertising or otherwise to promote the sale, use or other
+dealings in this Software without prior written authorization of the
+copyright holder.
+
+All trademarks and registered trademarks mentioned herein are the property of
+their respective owners.

Property changes on: contrib\icu\lib\ICU-LICENSE.txt
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/lib/icu4j-4_2.jar
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: contrib\icu\lib\icu4j-4_2.jar
___________________________________________________________________
Added: svn:mime-type
   + application/octet-stream

Index: contrib/icu/src/java/org/apache/lucene/icu/ICUAnalyzer.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/icu/ICUAnalyzer.java	(revision 0)
+++ contrib/icu/src/java/org/apache/lucene/icu/ICUAnalyzer.java	(revision 0)
@@ -0,0 +1,119 @@
+package org.apache.lucene.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.icu.tokenizer.ICUTokenizer;
+
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.Normalizer;
+
+/**
+ * Analyzer based on ICU (International Components for Unicode).
+ * <p>
+ * By default, the following processing is performed:
+ * </p>
+ * <ul>
+ * <li>Text is segmented into words according to the Unicode Standard.
+ * <li>Tokens are marked with a type: WORD, NUM, IDEO, or KANA.
+ * <li>Tokens are tagged with their ISO 15924 script code constants from ICU
+ * UScript (placed in the token flags).
+ * <li>Tokens are case-folded with full unicode case-folding, including
+ * FC_NFKC_Closure mappings.
+ * <li>All numeric digits are standardized to [0-9]
+ * <li>Characters that only affect display, such as formatting and variation
+ * selectors, are removed from tokens.
+ * <li>Text is normalized to NFKC (canonical and compatibility equivalence)
+ * <li>Adjacent Han, Khmer, Lao and Myanmar tokens are also joined into bigrams.
+ * </ul>
+ * TODO: Add better description and make it easier to tailor the tokenizer.
+ */
+public final class ICUAnalyzer extends Analyzer {
+  /*
+   * For these scripts, things are not tokenized into real words. Instead they
+   * are tokenized into something at subword level (such as syllable); bigrams
+   * are formed out of these subwords if they are adjacent. The array is only
+   * read by this class, so one shared copy serves every analyzer instance.
+   */
+  private static final int BIGRAM_SCRIPTS[] = {
+      UScript.HAN, UScript.KHMER, UScript.LAO, UScript.MYANMAR
+  };
+
+  private final Normalizer.Mode mode;
+
+  /** Create a new ICU Analyzer with the default normalization mode (NFKC) */
+  public ICUAnalyzer() {
+    this(Normalizer.NFKC);
+  }
+
+  /**
+   * Create a new ICU Analyzer with the supplied normalization mode.
+   * @param mode Normalization mode
+   */
+  public ICUAnalyzer(Normalizer.Mode mode) {
+    super();
+    this.mode = mode;
+  }
+
+  /**
+   * Wraps the tokenizer in the filter chain. Both public entry points go
+   * through here so that the two chains cannot drift apart.
+   */
+  private TokenStream buildChain(ICUTokenizer tokenizer) {
+    TokenStream stream = new ICUCaseFoldingFilter(tokenizer, mode);
+    stream = new ICUDigitFoldingFilter(stream);
+    stream = new ICUFormatFilter(stream);
+    stream = new ICUNormalizationFilter(stream, mode);
+    return new ICUBigramFilter(stream, mode, BIGRAM_SCRIPTS);
+  }
+
+  /** Builds a new analysis chain over <code>reader</code>. */
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+    return buildChain(new ICUTokenizer(reader));
+  }
+
+  /* Holder for the cached chain; static so it does not pin the analyzer. */
+  private static final class SavedStreams {
+    ICUTokenizer tokenStream; // source, re-pointed at each new Reader
+    TokenStream filteredTokenStream; // outermost filter, handed to callers
+  }
+
+  /**
+   * Returns the cached chain, re-targeted at <code>reader</code>, building
+   * and caching it on first use.
+   */
+  public TokenStream reusableTokenStream(String fieldName, Reader reader)
+      throws IOException {
+    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
+    if (streams == null) {
+      streams = new SavedStreams();
+      streams.tokenStream = new ICUTokenizer(reader);
+      streams.filteredTokenStream = buildChain(streams.tokenStream);
+      // cache only once fully built, so a failure above leaves nothing half-initialized
+      setPreviousTokenStream(streams);
+    } else {
+      streams.tokenStream.reset(reader);
+      streams.filteredTokenStream.reset();
+    }
+    return streams.filteredTokenStream;
+  }
+}

Property changes on: contrib\icu\src\java\org\apache\lucene\icu\ICUAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/java/org/apache/lucene/icu/ICUBigramFilter.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/icu/ICUBigramFilter.java	(revision 0)
+++ contrib/icu/src/java/org/apache/lucene/icu/ICUBigramFilter.java	(revision 0)
@@ -0,0 +1,200 @@
+package org.apache.lucene.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.BitSet;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.ArrayUtil;
+
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.Normalizer;
+
+/**
+ * A filter for forming word bigrams for selected writing systems.
+ * These bigrams are injected into the stream with a position increment of zero.
+ * 
+ * The following special conditions also apply:
+ * <ul>
+ *   <li>The bigram tokens are normalized under the supplied Mode.
+ *   <li>The bigram tokens are only formed from tokens written in the writing system scripts supplied.
+ *   <li>To form a bigram, adjacent tokens must be in the same writing system.
+ * </ul>
+ */
+public final class ICUBigramFilter extends TokenFilter {
+  private State previousState; // "left" token: captured attributes
+  private int previousEndOffset;
+  private int previousScript; // script code of "left", taken from its flags
+  
+  private State currentState; // "right" token: captured attributes
+  private char[] currentTermBuffer; // private copy of the term of "right"
+  private int currentTermLen;
+  private int currentEndOffset;
+  
+  private int step; // position in the state machine, see incrementToken()
+
+  private final TermAttribute termAtt;
+  private final OffsetAttribute offsetAtt;
+  private final FlagsAttribute flagsAtt;
+  private final PositionIncrementAttribute posIncAtt;
+  private final BitSet scriptMask; // bit n set: bigrams are formed for script code n
+  private final Normalizer.Mode mode;
+
+  /**
+   * Create a new ICUBigramFilter, operating on the provided input stream.
+   * @param input {@link TokenStream} to filter; each token's script is read from its flags
+   * @param mode Normalization mode applied to the concatenated bigram terms
+   * @param scripts script codes for which bigrams should be formed
+   */
+  public ICUBigramFilter(TokenStream input, Normalizer.Mode mode, int scripts[]) {
+    super(input);
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+    flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
+    posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+    this.mode = mode;
+    scriptMask = new BitSet(UScript.CODE_LIMIT);
+    for (int i = 0; i < scripts.length; i++)
+      scriptMask.set(scripts[i]);
+  }
+  
+  public boolean incrementToken() throws IOException {
+    /*
+     * The filter aims to concatenate two tokens "left" and "right". It is a
+     * state machine with 3 states (step 0, 1, and 2):
+     * [0] (start state, no buffered data)
+     * [1] ("left" token has been buffered and provided to consumer)
+     * [2] ("right" token has been provided to consumer)
+     */
+    switch(step) {
+      case 0:
+        return step0();
+      case 1:
+        return step1();
+      default: /* 2 */
+        return step2();
+    }
+  }
+  
+  public void reset() throws IOException {
+    super.reset();
+    step = 0;
+    previousState = null; // step 0 never reads the buffered tokens; release them
+    currentState = null;
+  }
+  
+  /* [0]: pass the next token through; buffer it as "left" if its script qualifies */
+  private boolean step0() throws IOException {
+    if (!input.incrementToken())
+      return false;
+    if (isSupportedScript())
+      bufferLeft();
+    return true;
+  }
+  
+  /* [1]: pass the next token through; if it pairs with "left", keep it as "right" */
+  private boolean step1() throws IOException {
+    if (!input.incrementToken())
+      return false;
+    if (isCompatible()) {
+      currentState = captureState();
+      currentEndOffset = offsetAtt.endOffset();
+      currentTermLen = termAtt.termLength();
+      // copy only the used part of the term, reusing our buffer when it fits
+      if (currentTermBuffer == null || currentTermBuffer.length < currentTermLen)
+        currentTermBuffer = new char[ArrayUtil.getNextSize(currentTermLen)];
+      System.arraycopy(termAtt.termBuffer(), 0, currentTermBuffer, 0, currentTermLen);
+      step = 2;
+    } else if (isSupportedScript()) {
+      bufferLeft(); // no pair: this token becomes the new "left"
+    } else {
+      step = 0;
+    }
+    return true;
+  }
+  
+  /* [2]: emit the bigram "left"+"right" with position increment 0; "right" becomes "left" */
+  private boolean step2() throws IOException {
+    clearAttributes(); // here we are the source of tokens
+    restoreState(previousState);
+    safeAppend(currentTermBuffer, currentTermLen);
+    offsetAtt.setOffset(offsetAtt.startOffset(), currentEndOffset);
+    posIncAtt.setPositionIncrement(0);
+    previousState = currentState;
+    previousEndOffset = currentEndOffset;
+    step = 1;
+    return true;
+  }
+  
+  /* Remembers the current token as "left" and moves to step 1. */
+  private void bufferLeft() {
+    previousState = captureState();
+    previousEndOffset = offsetAtt.endOffset();
+    previousScript = flagsAtt.getFlags();
+    step = 1;
+  }
+  
+  /* True if the current token's script (its flags) is one of the configured scripts. */
+  private boolean isSupportedScript() {
+    final int script = flagsAtt.getFlags();
+    // BitSet.get throws IndexOutOfBoundsException for a negative index
+    return script >= 0 && scriptMask.get(script);
+  }
+  
+  /* True if the current token has the script of "left" and starts exactly where "left" ends. */
+  private boolean isCompatible() {
+    return flagsAtt.getFlags() == previousScript &&
+      offsetAtt.startOffset() == previousEndOffset;
+  }
+  
+  /* Appends right[0..rightLen) to the current term and ensures the result is normalized under mode. */
+  private void safeAppend(char right[], int rightLen) {
+    int leftLen = termAtt.termLength();
+    int newLen = leftLen + rightLen;
+    char left[] = termAtt.resizeTermBuffer(newLen);
+    /*
+     * Sub-optimal implementation, see
+     * http://bugs.icu-project.org/trac/ticket/7120: Normalizer.concatenate (icu4j) gives incorrect results
+     * When this is fixed should be able to use ICUVersion to use a more optimal approach for >= 4.4
+     */
+    System.arraycopy(right, 0, left, leftLen, rightLen);
+
+    if (Normalizer.quickCheck(left, 0, newLen, mode, 0) == Normalizer.YES) {
+      termAtt.setTermLength(newLen);
+      return;
+    }
+    char buffer[] = new char[newLen << 1];
+    do {
+      try {
+        final int normalizedLen = Normalizer.normalize(left, 0, newLen, buffer, 0,
+            buffer.length, mode, 0);
+        termAtt.setTermBuffer(buffer, 0, normalizedLen);
+        return;
+      } catch (IndexOutOfBoundsException e) {
+        // ICU encodes the necessary size as a String in the exception, but don't depend on that
+        buffer = new char[ArrayUtil.getNextSize(buffer.length << 1)];
+      }
+    } while (true);
+  }
+}

Property changes on: contrib\icu\src\java\org\apache\lucene\icu\ICUBigramFilter.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/java/org/apache/lucene/icu/ICUCaseFoldingFilter.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/icu/ICUCaseFoldingFilter.java	(revision 0)
+++ contrib/icu/src/java/org/apache/lucene/icu/ICUCaseFoldingFilter.java	(revision 0)
@@ -0,0 +1,291 @@
+package org.apache.lucene.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.ArrayUtil;
+
+import com.ibm.icu.impl.UCaseProps;
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.Normalizer;
+import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * A {@link TokenFilter} that provides Unicode default caseless matching.
+ * <p>
+ * Default caseless matching, or case-folding is more than just conversion to
+ * lowercase. For example, it handles cases such as the Greek sigma, so that
+ * "Μάϊος" and "ΜΆΪΟΣ" will match correctly.
+ * </p>
+ * <p>
+ * Case-folding is still only an approximation of the language-specific rules
+ * governing case. If the specific language is known, consider using
+ * ICUCollationKeyFilter and indexing collation keys instead. This filter
+ * performs the "full" case-folding specified in the Unicode standard, and this
+ * may change the length of the term. For example, the German ß is case-folded
+ * to the string 'ss'.
+ * </p>
+ * <p>
+ * This filter respects both canonical and compatibility equivalence. The
+ * default caseless matching in the Unicode standard respects canonical
+ * equivalence. Additional logic has been added to this filter to respect
+ * compatibility equivalence, avoiding an extra normalization. This logic also
+ * ensures the output is <i>closed</i> under all compatibility forms. You do not
+ * need to normalize before folding, or fold and normalize twice. Closure means
+ * all of the below conditions are true:
+ * <ul>
+ * <li>NFC(Fold(x)) = NFC(Fold(NFC(Fold(x))))
+ * <li>NFD(Fold(x)) = NFD(Fold(NFD(Fold(x))))
+ * <li>NFKC(Fold(x)) = NFKC(Fold(NFKC(Fold(x))))
+ * <li>NFKD(Fold(x)) = NFKD(Fold(NFKD(Fold(x))))
+ * </ul>
+ * </p>
+ * <p>
+ * This filter does not preserve normalization forms. Instead, suggested usage
+ * is to first case-fold, then normalize.
+ * </p>
+ * For more details, see Unicode Standard sections 5.18: <a
+ * href="http://www.unicode.org/versions/Unicode5.0.0/ch05.pdf#G21790">Caseless
+ * Matching</a> and 3.13: <a
+ * href="http://www.unicode.org/versions/Unicode5.0.0/ch03.pdf#G34145">Default
+ * Caseless Matching</a>
+ */
+
+public final class ICUCaseFoldingFilter extends TokenFilter {
+  // this is true for NFKC or NFKD. When set, the FC_NFKC_Closure mappings from
+  // DerivedNormalizationProps.txt will be applied.
+  private final boolean nfkcClosure;
+
+  // case-folding output buffer, will be resized if necessary.
+  private char buffer[] = new char[4096];
+
+  // new api term attribute, will be updated with folded text.
+  private TermAttribute termAtt;
+
+  /**
+   * Create a new ICUCaseFoldingFilter, operating on the provided input stream.
+   * Output will be case-folded and closed under the supplied normalization
+   * mode.
+   * 
+   * @param input {@link TokenStream} to filter
+   * @param mode Normalization mode hint, will ensure output is closed under
+   *        that mode.
+   */
+  public ICUCaseFoldingFilter(TokenStream input, Normalizer.Mode mode) {
+    super(input);
+    nfkcClosure = (mode == Normalizer.NFKC || mode == Normalizer.NFKD);
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+  }
+
+  /*
+   * Algorithm: Up-front, the set of BMP codepoints that are case-insensitive
+   * is calculated. For these, and for basic latin, perform simple lower-case
+   * in place, as it is equivalent to case-folding.
+   * 
+   * If a surrogate or case-sensitive (outside of basic latin) character is
+   * encountered, bail and fold the whole term down the slower path.
+   */
+
+  public boolean incrementToken() throws IOException {
+
+    if (input.incrementToken()) {
+      final char src[] = termAtt.termBuffer();
+      final int length = termAtt.termLength();
+
+      for (int i = 0; i < length; i++) {
+        final char ch = src[i];
+
+        if (ch <= 0x7F /* basic latin */
+            || (!UTF16.isSurrogate(ch) && insensitive.contains(ch))) {
+          src[i] = Character.toLowerCase(ch); // either basic latin or a
+                                              // BMP-case-insensitive codepoint
+        } else {
+
+          /*
+           * a case-sensitive codepoint outside of basic latin, or a surrogate
+           * has been encountered. bail out completely and invoke the 'slow'
+           * case folding algorithm on the whole term.
+           */
+
+          final int requiredLength = length << 2; // Max expansion factor: 3x
+                                                  // for case folding. 4x for
+                                                  // case folding + NFKC closure
+
+          if (buffer.length < requiredLength)
+            buffer = new char[ArrayUtil.getNextSize(requiredLength)];
+
+          final int newLength = fold(src, length, buffer, nfkcClosure);
+          termAtt.setTermBuffer(buffer, 0, newLength);
+          return true;
+        }
+      }
+
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  /*
+   * What ICU should have: some way to do full case folding on char[]. Behind
+   * the scenes this is still bad for the single-many case (look at
+   * UCaseProps.java). For really good performance, it is not hard to create
+   * IntTrie data files from CaseFolding.txt and DerivedNormalizationProps.txt,
+   * but this would introduce a maintenance hassle: the data files would have
+   * to be updated whenever the Unicode standard is. Finally, there is a
+   * ticket to improve this in ICU: http://bugs.icu-project.org/trac/ticket/5072
+   * 
+   * Add a function that performs string normalization and case folding
+   * according to the Unicode Standard, chapter 5 "Implementation Guidelines",
+   * section "Case Mappings". The availability of such a function would help
+   * avoid errors by users who are unaware of the complications involved. There
+   * should be a parameter for the output normalization form. For "K" forms, the
+   * FC_NFKC_Closure should be applied as well.
+   * 
+   * Until this functionality is implemented, this looks to be the best overall
+   * tradeoff. When it is, the logic below should be removed!
+   */
+
+  // Low-level unicode case properties functionality
+  private static final UCaseProps caseProps;
+
+  /*
+   * Output buffer for when a single codepoint folds to multiple codepoints,
+   * required by UCaseProps. Fortunately, this is only used for the case where
+   * the length of a string is increased. Unfortunately, behind the scenes
+   * UCaseProps appends to this buffer with 'new String(xxx)'...
+   */
+  private final StringBuffer foldOut = new StringBuffer(4);
+
+  // Output buffer for when a codepoint has an FC_NFKC_Closure mapping
+  private final char replacement[] = new char[4];
+
+  // set of case-insensitive BMP characters.
+  private static final UnicodeSet insensitive;
+
+  static {
+
+    /*
+     * In the ICU UCharacter implementation there is some logic here involving
+     * getDummy(). This is supposedly for the case in which the case-properties
+     * data cannot be loaded into memory.
+     * 
+     * In this case, there are bigger problems if there isn't enough memory for
+     * the JVM to even lowercase!
+     */
+
+    try {
+      caseProps = UCaseProps.getSingleton();
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+
+    /*
+     * Perhaps too conservative, but still over 60,000 BMP code points. This is
+     * nice because it is correct even if FC_NFKC_Closure mapping is being
+     * applied (NFKD/NFKC case): if the codepoint has NFKC_Quick_Check=Yes, then
+     * it cannot have an FC_NFKC_Closure mapping. Subtract from this set any of
+     * these that are case-sensitive.
+     */
+
+    insensitive = new UnicodeSet(
+        "[[:NFKC_Quick_Check=Yes:]-[:Case_Sensitive=True:]]");
+    insensitive.compact();
+    insensitive.freeze();
+  }
+
+  /**
+   * Folds characters from src to dst, optionally applying FC_NFKC_Closure
+   * mappings. The output buffer must have enough storage to contain the folded
+   * text.
+   * 
+   * There is no such maximum limit specified in the standard, but a maximum
+   * expansion factor of 3x is specified here:
+   * http://unicode.org/reports/tr36/tr36-6.html#Buffer_Overflows
+   * 
+   * When applying FC_NFKC_Closure mappings, this expansion factor increases to
+   * 4x.
+   * 
+   * @param src input buffer
+   * @param length input length
+   * @param dst output buffer
+   * @param closure true if FC_NFKC_Closure mappings should be applied
+   * @return length of folded output.
+   */
+  private int fold(char src[], int length, char dst[], boolean closure) {
+    int codepoint;
+    int folded;
+    int closed;
+    int newlen = 0;
+
+    for (int i = 0; i < length; i += UTF16.getCharCount(codepoint)) {
+      codepoint = UTF16.charAt(src, 0, length, i);
+
+      /*
+       * If closure mappings are requested, and a mapping is applied, then
+       * case-folding is complete for this codepoint.
+       */
+
+      if (closure) {
+        closed = Normalizer.getFC_NFKC_Closure(codepoint, replacement);
+        if (closed > 0) { // apply the closure mapping, no need to apply
+                          // toFullFolding.
+          System.arraycopy(replacement, 0, dst, newlen, closed);
+          newlen += closed;
+          continue;
+        }
+      }
+
+      /*
+       * The UCaseProps toFullFolding has several possible return values r for
+       * an input codepoint: r < 0 (~codepoint): the codepoint is already
+       * case-folded. 0 <= r <= MAX_STRING_LENGTH (31): the codepoint maps to
+       * multiple codepoints, which are placed in the supplied StringBuffer; r
+       * is the length of that sequence. r > MAX_STRING_LENGTH: r is the
+       * case-folded single-codepoint result.
+       */
+
+      folded = caseProps.toFullFolding(codepoint, foldOut,
+          UCharacter.FOLD_CASE_DEFAULT);
+
+      if (folded < 0) { // codepoint is already folded
+        newlen += UCharacter.toChars(codepoint, dst, newlen);
+      } else if (folded <= UCaseProps.MAX_STRING_LENGTH) { // codepoint folds to
+                                                           // multiple
+                                                           // codepoints
+        foldOut.getChars(0, folded, dst, newlen);
+        foldOut.setLength(0);
+        newlen += folded;
+      } else if (folded < UTF16.SUPPLEMENTARY_MIN_VALUE) { // an attempt to
+                                                           // optimize the BMP
+                                                           // case.
+        dst[newlen++] = (char) folded;
+      } else {
+        newlen += UCharacter.toChars(folded, dst, newlen);
+      }
+    }
+
+    return newlen;
+  }
+
+}

Property changes on: contrib\icu\src\java\org\apache\lucene\icu\ICUCaseFoldingFilter.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/java/org/apache/lucene/icu/ICUDigitFoldingFilter.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/icu/ICUDigitFoldingFilter.java	(revision 0)
+++ contrib/icu/src/java/org/apache/lucene/icu/ICUDigitFoldingFilter.java	(revision 0)
@@ -0,0 +1,113 @@
+package org.apache.lucene.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.ArrayUtil;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.UTF16;
+
+/**
+ * A {@link TokenFilter} that folds numeric digits to their ASCII form.
+ * <p>
+ * There are many different representations of numeric digits for different
+ * scripts in Unicode. This filter folds numeric digits to their ASCII form.
+ * </p>
+ * For example, '৭০৬' will be folded to '706'
+ * 
+ */
+
+public final class ICUDigitFoldingFilter extends TokenFilter {
+
+  // scratch output for the slow path; swapped for a larger array on demand
+  private char buffer[] = new char[4096];
+
+  // term text attribute of the token currently being filtered
+  private TermAttribute termAtt;
+
+  /**
+   * Create a filter that folds the digits of the tokens of the given stream.
+   * 
+   * @param input {@link TokenStream} to filter
+   */
+  public ICUDigitFoldingFilter(TokenStream input) {
+    super(input);
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+  }
+
+  /*
+   * Strategy: digits outside of basic latin and surrogates are both expected
+   * to be rare, so the term is first scanned char by char and is returned
+   * untouched when it contains neither. The first surrogate or non-latin
+   * digit ends the scan and the entire term goes through the slower
+   * codepoint-based folding.
+   */
+
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken())
+      return false;
+
+    final char term[] = termAtt.termBuffer();
+    final int termLength = termAtt.termLength();
+
+    // stays true while no char that forces the slow path has been seen
+    boolean clean = true;
+    for (int pos = 0; clean && pos < termLength; pos++) {
+      final char c = term[pos];
+      clean = c <= 0x7F || !(UTF16.isSurrogate(c) || UCharacter.isDigit(c));
+    }
+    if (clean)
+      return true;
+
+    // folding never lengthens the text, so the term length is enough room
+    if (buffer.length < termLength)
+      buffer = new char[ArrayUtil.getNextSize(termLength)];
+
+    final int foldedLength = foldNumerics(term, termLength, buffer);
+    termAtt.setTermBuffer(buffer, 0, foldedLength);
+    return true;
+  }
+
+  /*
+   * Copies src[0..length) into dst, writing the base 10 digit char in place
+   * of every codepoint that UCharacter.isDigit() accepts, and returns the
+   * number of chars written.
+   * 
+   * There are no closure issues here. For all codepoints, NFKC(foldNumeric(x))
+   * = NFKC(foldNumeric(NFKC(foldNumeric(x))))
+   */
+
+  private int foldNumerics(char src[], int length, char dst[]) {
+    int written = 0;
+    int pos = 0;
+
+    while (pos < length) {
+      final int cp = UTF16.charAt(src, 0, length, pos);
+      final int mapped = UCharacter.isDigit(cp)
+          ? UCharacter.forDigit(UCharacter.getNumericValue(cp), 10) : cp;
+      written += UCharacter.toChars(mapped, dst, written);
+      pos += UTF16.getCharCount(cp);
+    }
+    return written;
+  }
+}

Property changes on: contrib\icu\src\java\org\apache\lucene\icu\ICUDigitFoldingFilter.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/java/org/apache/lucene/icu/ICUFormatFilter.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/icu/ICUFormatFilter.java	(revision 0)
+++ contrib/icu/src/java/org/apache/lucene/icu/ICUFormatFilter.java	(revision 0)
@@ -0,0 +1,119 @@
+package org.apache.lucene.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.ArrayUtil;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * A {@link TokenFilter} that removes codepoints that affect the formatting and
+ * display of text.
+ * <p>
+ * Some codepoints in Unicode exist only to alter the formatting or display of
+ * text. This filter removes Format and Variation Selector codepoints.
+ * </p>
+ * TODO: more docs
+ */
+public final class ICUFormatFilter extends TokenFilter {
+
+  // the codepoints to remove: Format and Variation Selector codepoints
+  private static final UnicodeSet formatSet;
+
+  static {
+    final UnicodeSet set = new UnicodeSet(
+        "[[:General_Category=Format:][:Variation_Selector=True:]]");
+    set.compact();
+    set.freeze();
+    formatSet = set;
+  }
+
+  private char buffer[] = new char[4096];
+
+  private TermAttribute termAtt;
+
+  /**
+   * Create a filter that strips format codepoints from the given stream.
+   * 
+   * @param input {@link TokenStream} to filter
+   */
+  public ICUFormatFilter(TokenStream input) {
+    super(input);
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+  }
+
+  /*
+   * Strategy: Format/Variation Selector codepoints and surrogates are both
+   * expected to be rare, so the term is first scanned char by char and is
+   * returned untouched when it contains neither. The first such char ends
+   * the scan and the entire term goes through the slower removal.
+   * 
+   * Basic latin chars are never looked up in the set: the range contains no
+   * format codepoints, and the comparison is much cheaper than the
+   * UnicodeSet.contains() binary search.
+   */
+
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken())
+      return false;
+
+    final char term[] = termAtt.termBuffer();
+    final int termLength = termAtt.termLength();
+
+    boolean clean = true;
+    for (int pos = 0; clean && pos < termLength; pos++) {
+      final char c = term[pos];
+      clean = c <= 0x7F || !(UTF16.isSurrogate(c) || formatSet.contains(c));
+    }
+    if (clean)
+      return true;
+
+    // removal never lengthens the text, so the term length is enough room
+    if (buffer.length < termLength)
+      buffer = new char[ArrayUtil.getNextSize(termLength)];
+
+    final int keptLength = removeFormat(term, termLength, buffer);
+    termAtt.setTermBuffer(buffer, 0, keptLength);
+    return true;
+  }
+
+  /*
+   * Copies the codepoints of src[0..length) that are not in formatSet into
+   * dst and returns the number of chars written.
+   * 
+   * TODO: validate closure for NFKC/NFKD
+   */
+  private static int removeFormat(char src[], int length, char dst[]) {
+    int written = 0;
+    int pos = 0;
+    while (pos < length) {
+      final int cp = UTF16.charAt(src, 0, length, pos);
+      if (!formatSet.contains(cp))
+        written += UCharacter.toChars(cp, dst, written);
+      pos += UTF16.getCharCount(cp);
+    }
+    return written;
+  }
+}

Property changes on: contrib\icu\src\java\org\apache\lucene\icu\ICUFormatFilter.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/java/org/apache/lucene/icu/ICUNormalizationFilter.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/icu/ICUNormalizationFilter.java	(revision 0)
+++ contrib/icu/src/java/org/apache/lucene/icu/ICUNormalizationFilter.java	(revision 0)
@@ -0,0 +1,249 @@
+package org.apache.lucene.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.ArrayUtil;
+
+import org.apache.lucene.analysis.ASCIIFoldingFilter; // for javadoc only, example of a filter that does not respect canonical equivalence.
+
+import com.ibm.icu.text.Normalizer;
+
+/**
+ * A {@link TokenFilter} that performs Unicode text normalization.
+ * <p>
+ * Normalization standardizes different forms of the same character in Unicode.
+ * For any search application, it is essential to apply normalization to at
+ * least ensure canonical equivalence. For example, a Vietnamese input method on
+ * one operating system might represent the character ằ as one codepoint: LATIN
+ * SMALL LETTER A WITH BREVE AND GRAVE, whereas the input method on another
+ * operating system might represent the same character as two: LATIN SMALL
+ * LETTER A WITH BREVE followed by COMBINING GRAVE ACCENT. Unless text is
+ * normalized to a standard form, queries and documents from these different
+ * systems will not be interpreted as the same character!
+ * </p>
+ * <p>
+ * There are four modes that text can be normalized to:
+ * <ul>
+ * <li>NFD: Canonical Decomposition
+ * <li>NFC: Canonical Decomposition, followed by Canonical Composition
+ * <li>NFKD: Compatibility Decomposition
+ * <li>NFKC: Compatibility Decomposition, followed by Canonical Composition
+ * </ul>
+ * </p>
+ * <p>
+ * For most search tasks, it makes sense to normalize to NFC or NFKC, as the
+ * composed form will generally be shorter than the decomposed form. The
+ * decomposed forms can still be useful for some tasks, for example providing
+ * decomposed Korean text to a downstream {@link TokenFilter} would allow that
+ * filter to work with individual Jamo instead of composed Hangul syllables.
+ * </p>
+ * <p>
+ * For a typical search application, the way in which the text will be
+ * standardized is the most important, and the two types of standardization are
+ * described below.
+ * </p>
+ * <ul>
+ * <li>Under normalization forms NFC and NFD, characters that are canonical
+ * equivalents will be standardized.
+ * <p>
+ * Canonical equivalence is where there are multiple ways to encode the same
+ * character or sequence of characters in Unicode. These differences display the
+ * same to a user, but are different to the computer.
+ * </p>
+ * <p>
+ * For example, é can be encoded in Unicode in at least two different ways:
+ * <ul>
+ * <li>U+00E9 [LATIN SMALL LETTER E WITH ACUTE]
+ * <li>U+0065 U+0301 [LATIN SMALL LETTER E] [COMBINING ACUTE ACCENT]
+ * </ul>
+ * </p>
+ * <br>
+ * <li>Under normalization forms NFKC and NFKD, characters that are canonical or
+ * compatibility equivalents will be standardized.
+ * <p>
+ * Compatibility equivalence is a weaker form of equivalence than canonical
+ * equivalence. Similar to canonical equivalents, compatibility equivalents are
+ * different ways to represent the same character. The difference is that unlike
+ * canonical equivalents, compatibility equivalents may have different visual
+ * appearance or format.
+ * </p>
+ * <p>
+ * For example, the letter Ａ appears different than A, because of its width. The
+ * below two forms are not canonical equivalents, but are compatibility
+ * equivalents:
+ * <ul>
+ * <li>U+FF21 [FULLWIDTH LATIN CAPITAL LETTER A]
+ * <li>U+0041 [LATIN CAPITAL LETTER A]
+ * </ul>
+ * </p>
+ * <br>
+ * </ul>
+ * <p>
+ * Normalization is computationally expensive and can both reorder characters
+ * and change the length of text. In practice, typically the majority of text is
+ * already normalized. This filter first performs a quick-check, and performs
+ * normalization only when this quick-check fails or is uncertain.
+ * </p>
+ * <p>
+ * When designing an analysis pipeline, it is important to minimize the number
+ * of times you invoke ICUNormalizationFilter. At the same time, it is equally
+ * important that the analysis process behaves in such a way that all equivalent
+ * text is treated the same. The naïve solution to this problem is to invoke
+ * ICUNormalizationFilter both before and after every {@link TokenFilter} in the
+ * pipeline. This ensures that all equivalent text is treated the same and
+ * remains normalized, but is very inefficient. <br>
+ * Instead the two simple rules below can be followed to minimize the number of
+ * invocations:
+ * </p>
+ * <p>
+ * <b>If a {@link TokenFilter} does not <i>respect</i> the equivalence defined
+ * for the normalization form, ICUNormalizationFilter must be called before that
+ * {@link TokenFilter}.</b> <br>
+ * This way, text is provided to that TokenFilter in a form that it understands,
+ * and will be processed correctly. For example, the {@link ASCIIFoldingFilter}
+ * does not respect canonical equivalence: it only folds precomposed
+ * character+accent combinations to an accent-free form. Because of this, the
+ * two forms of é listed in the example above will be treated differently; only
+ * one will have its accent mark removed! By invoking ICUNormalizationFilter
+ * with NFC first, you can ensure that both are treated the same; both will have
+ * their accent marks removed.
+ * </p>
+ * <p>
+ * <b>If a {@link TokenFilter} does not <i>preserve</i> the normalization form,
+ * ICUNormalizationFilter must be called at some point after that
+ * {@link TokenFilter} before indexing.</b> <br>
+ * When a {@link TokenFilter} modifies text, it might cause text to become
+ * denormalized. There are a number of ways this can happen, even concatenation
+ * of two normalized chunks of text can produce a denormalized result. For
+ * example, although the {@link ICUCaseFoldingFilter} respects both canonical
+ * and compatibility equivalence, it does not preserve normalization forms. By
+ * the first rule above, because it respects canonical equivalence, the
+ * ICUNormalizationFilter need not be invoked before it for normalization form
+ * NFC. But, because it does not preserve normalization form NFC, the
+ * ICUNormalizationFilter must be invoked before indexing, or before any
+ * downstream TokenFilter that does not respect canonical equivalence, whichever
+ * comes first.
+ * </p>
+ * <p>
+ * It is generally more difficult to preserve normalization forms than it is to
+ * respect equivalence. Respecting equivalence is usually a simple matter of
+ * adding additional mappings. When designing an analysis pipeline, it is
+ * recommended that every {@link TokenFilter} respect equivalence, and at the
+ * end of the pipeline ICUNormalizationFilter can be invoked a single time.
+ * </p>
+ * For more details, see UAX #15: <a
+ * href="http://www.unicode.org/reports/tr15/">Unicode Normalization Forms</a>
+ */
+
+public final class ICUNormalizationFilter extends TokenFilter {
+  // normalization form that the term text is converted to
+  private final Normalizer.Mode mode;
+
+  // receives the normalized text; replaced by a larger array when too small
+  private char buffer[] = new char[4096];
+
+  // term text attribute, overwritten only when normalization is performed
+  private TermAttribute termAtt;
+
+  /**
+   * Create an ICUNormalizationFilter that normalizes text to the specified
+   * mode.
+   * 
+   * @param input {@link TokenStream} to filter
+   * @param mode Normalization mode to apply
+   */
+  public ICUNormalizationFilter(TokenStream input, Normalizer.Mode mode) {
+    super(input);
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    this.mode = mode;
+  }
+
+  /**
+   * Advance to the next token and normalize its term text if necessary.
+   * Text that passes the quick-check is left untouched.
+   * 
+   * @return the result of the wrapped stream's incrementToken()
+   */
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken())
+      return false;
+
+    final char term[] = termAtt.termBuffer();
+    final int termLength = termAtt.termLength();
+
+    /*
+     * Quick-check first: the significant majority of text is expected to be
+     * normalized already, and such a term is returned as is. The check answers
+     * YES, NO or MAYBE. YES means the text is normalized, NO that it
+     * definitely is not. For MAYBE the only way to find out is to actually
+     * normalize the text, so both NO and MAYBE lead to normalization below.
+     * See http://www.unicode.org/unicode/reports/tr15/tr15-23.html#Annex8
+     */
+
+    if (Normalizer.quickCheck(term, 0, termLength, mode, 0) == Normalizer.YES)
+      return true;
+
+    /*
+     * Each form has a known maximum expansion that would make the loop and
+     * the exception handling below unnecessary. These maxima may change in
+     * new versions of the Unicode standard and are sometimes large, so the
+     * loop is kept for simplicity and ease of maintenance; with a large
+     * default buffer size it should rarely execute more than once.
+     * 
+     * From http://unicode.org/reports/tr36/tr36-6.html#Buffer_Overflows: The
+     * very large factors in the case of NFKC/D are due to some extremely rare
+     * characters. Thus algorithms can use much smaller expansion factors for
+     * the typical cases as long as they have a fallback process that accounts
+     * for the possibility of these characters in data.
+     * 
+     * For example, under normalization forms NFKC or NFKD, ﷺ (FDFA, ARABIC
+     * LIGATURE SALLALLAHOU ALAYHE WASALLAM) will be expanded to صلى الله عليه
+     * وسلم
+     */
+
+    // retry with a larger buffer until the normalized text fits
+    while (true) {
+      try {
+
+        /*
+         * This method is documented in the public API to throw
+         * IndexOutOfBoundsException if there is not enough space. It is an
+         * unfortunate mechanism; it would be a lot nicer if it behaved like
+         * the ArabicShaping API and returned the necessary length, possibly
+         * more than the buffer supplied. A call with a 0-length output
+         * buffer would then simply report the necessary length.
+         */
+
+        final int normalizedLength = Normalizer.normalize(term, 0, termLength,
+            buffer, 0, buffer.length, mode, 0);
+        termAtt.setTermBuffer(buffer, 0, normalizedLength);
+        return true;
+      } catch (IndexOutOfBoundsException tooSmall) {
+        // technically, ICU encodes the necessary size as a String in the
+        // exception, but don't depend on that...
+        final int doubled = buffer.length << 1;
+        buffer = new char[ArrayUtil.getNextSize(doubled)];
+      }
+    }
+  }
+}

Property changes on: contrib\icu\src\java\org\apache\lucene\icu\ICUNormalizationFilter.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/java/org/apache/lucene/icu/ICUTransformFilter.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/icu/ICUTransformFilter.java	(revision 0)
+++ contrib/icu/src/java/org/apache/lucene/icu/ICUTransformFilter.java	(revision 0)
@@ -0,0 +1,235 @@
+package org.apache.lucene.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+import org.apache.lucene.icu.ICUNormalizationFilter; // only used for javadoc.
+import org.apache.lucene.util.ArrayUtil;
+
+import com.ibm.icu.text.Replaceable;
+import com.ibm.icu.text.RuleBasedTransliterator; // only used for optimizing the transform, see below
+import com.ibm.icu.text.Transliterator;
+import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * A {@link TokenFilter} that transforms text with ICU.
+ * <p>
+ * ICU provides text-transformation functionality via its Transliteration API.
+ * Although script conversion is its most common use, a transliterator can
+ * actually perform a more general class of tasks. In fact, Transliterator
+ * defines a very general API which specifies only that a segment of the input
+ * text is replaced by new text. The particulars of this conversion are
+ * determined entirely by subclasses of Transliterator.
+ * </p>
+ * <p>
+ * Some useful transformations for search are built-in:
+ * <ul>
+ * <li>Conversion from Traditional to Simplified Chinese characters
+ * <li>Conversion from Hiragana to Katakana
+ * <li>Conversion from Fullwidth to Halfwidth forms.
+ * <li>Script conversions, for example Serbian Cyrillic to Latin
+ * </ul>
+ * For more advanced cases, the following capabilities might also be of use:
+ * <ul>
+ * <li>Conversion of Thai text from glyphic order to logical order for internal
+ * processing.
+ * <li>Romanization of text, or conversion between the different Indic scripts.
+ * <li>Creation of custom rules specific to your application's needs.
+ * </ul>
+ * </p>
+ * <p>
+ * Example usage: <blockquote>stream = new ICUTransformFilter(stream,
+ * Transliterator.getInstance("Traditional-Simplified"));</blockquote>
+ * </p>
+ * <p>
+ * Whether or not this filter respects equivalence or preserves normalization
+ * forms depends entirely upon the ruleset being applied.
+ * </p>
+ * <p>
+ * For good performance, it is helpful to declare a filter in any custom
+ * transform you build. This allows the transform to efficiently skip over
+ * unaffected text. It is also useful to consider if there are simpler
+ * solutions. For example, if you want to standardize Fullwidth and Halfwidth
+ * forms, use of {@link ICUNormalizationFilter} with compatibility decomposition
+ * will erase width differences, with better performance.
+ * </p>
+ * For more details, see the <a
+ * href="http://userguide.icu-project.org/transforms/general">ICU User
+ * Guide</a>.
+ */
+
+public final class ICUTransformFilter extends TokenFilter {
+  // Transliterator to transform the text
+  private final Transliterator transform;
+
+  // Reusable position object
+  private final Transliterator.Position position = new Transliterator.Position();
+
+  // Wraps a termAttribute around the replaceable interface.
+  private final ReplaceableTermAttribute replaceableAttribute = new ReplaceableTermAttribute();
+
+  // new api term attribute, will be updated with transformed text.
+  private TermAttribute termAtt;
+
+  /**
+   * Create a new ICUTransformFilter that transforms text on the given stream.
+   * If the transform is a RuleBasedTransliterator with no filter of its own,
+   * its source set (when non-empty) is installed on it as its filter.
+   * 
+   * @param input {@link TokenStream} to filter.
+   * @param transform Transliterator to transform the text.
+   */
+  public ICUTransformFilter(TokenStream input, Transliterator transform) {
+    super(input);
+    this.transform = transform;
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+
+    /*
+     * A good UnicodeFilter is vital for performance. Unfortunately, sometimes
+     * people omit filters in their rulesets. However, in the special case that
+     * the transform is a RuleBasedTransliterator, this situation can be
+     * corrected. It can only be applied to a pure RuleBasedTransliterator, and
+     * it is only applied when there is no supplied filter.
+     * 
+     * For a great example of a ruleset like this, see the built-in
+     * Simplified/Traditional ruleset from CLDR. This is a massive performance
+     * optimization for that case!
+     * 
+     * If CompoundTransliterator and its children were exposed (it is
+     * package-private and its children are inaccessible), then more cases
+     * could be optimized.
+     * 
+     * Regardless of who wrote the rules, you can ALWAYS apply a filter on your
+     * own: Transliterator.getInstance(
+     * "[:Arabic:] UnfilteredTransformThatOnlyProcessesTheArabicBlock");
+     * 
+     * Just be careful to ensure you don't filter characters that should be
+     * converted! This can be tricky if, for example, the transliterator
+     * internally invokes ::NFKC().
+     */
+    if (transform.getFilter() == null
+        && (transform instanceof RuleBasedTransliterator)) {
+      final UnicodeSet sourceSet = transform.getSourceSet();
+      if (sourceSet != null && !sourceSet.isEmpty())
+        transform.setFilter(sourceSet);
+    }
+  }
+
+  public boolean incrementToken() throws IOException {
+    /*
+     * Wrap the TermAttribute around the replaceable interface, clear the
+     * positions, and transliterate. Finally, update the TermAttribute with the
+     * [potentially different] length.
+     */
+    if (input.incrementToken()) {
+      final int length = termAtt.termLength();
+      replaceableAttribute.setText(termAtt);
+
+      position.start = 0;
+      position.limit = length;
+      position.contextStart = 0;
+      position.contextLimit = length;
+
+      transform.filteredTransliterate(replaceableAttribute, position, false);
+      termAtt.setTermLength(replaceableAttribute.length());
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  /**
+   * Wrap a {@link TermAttribute} with the Replaceable API.
+   * 
+   * This allows for ICU transforms to run without unnecessary object creation.
+   * 
+   * This wrapper does not keep the TermAttribute's length up to date at all
+   * times; when you are done you must finalize the replacement process by
+   * setting the TermAttribute's length.
+   */
+  final class ReplaceableTermAttribute implements Replaceable {
+    private char buffer[];
+    private int length;
+    private TermAttribute token;
+
+    ReplaceableTermAttribute() {
+    }
+
+    // Point this wrapper at the attribute's current buffer and length.
+    void setText(final TermAttribute reusableToken) {
+      this.token = reusableToken;
+      this.buffer = reusableToken.termBuffer();
+      this.length = reusableToken.termLength();
+    }
+
+    public int char32At(int pos) {
+      return UTF16.charAt(buffer, 0, length, pos);
+    }
+
+    public char charAt(int pos) {
+      return buffer[pos];
+    }
+
+    // Inserts a copy of [start, limit) at dest, going through a temporary.
+    public void copy(int start, int limit, int dest) {
+      char text[] = new char[limit - start];
+      getChars(start, limit, text, 0);
+      replace(dest, dest, text, 0, limit - start);
+    }
+
+    public void getChars(int srcStart, int srcLimit, char[] dst, int dstStart) {
+      System.arraycopy(buffer, srcStart, dst, dstStart, srcLimit - srcStart);
+    }
+
+    public boolean hasMetaData() {
+      return false;
+    }
+
+    public int length() {
+      return length;
+    }
+
+    public void replace(int start, int limit, String text) {
+      replace(start, limit, text.toCharArray(), 0, text.length());
+    }
+
+    public void replace(int start, int limit, char[] text, int charsStart,
+        int charsLen) {
+      final int replacementLength = limit - start;
+      final int newLength = length - replacementLength + charsLen;
+      // Grow only when the text gets longer. resizeTermBuffer() is specified
+      // to grow the buffer to at least the requested size while keeping its
+      // content, so the exact length is requested and sizing is left to it.
+      if (newLength > length)
+        buffer = token.resizeTermBuffer(newLength);
+      // if the substring being replaced is longer or shorter than the
+      // replacement, need to shift things around
+      if (replacementLength != charsLen && limit < length)
+        System.arraycopy(buffer, limit, buffer, start + charsLen, length - limit);
+      // insert the replacement text
+      System.arraycopy(text, charsStart, buffer, start, charsLen);
+      length = newLength;
+    }
+  }
+}

Property changes on: contrib\icu\src\java\org\apache\lucene\icu\ICUTransformFilter.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/java/org/apache/lucene/icu/package.html
===================================================================
--- contrib/icu/src/java/org/apache/lucene/icu/package.html	(revision 0)
+++ contrib/icu/src/java/org/apache/lucene/icu/package.html	(revision 0)
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<body>
+An analyzer and set of analysis components for Unicode text, based on ICU.
+</body>
+</html>
\ No newline at end of file

Property changes on: contrib\icu\src\java\org\apache\lucene\icu\package.html
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/java/org/apache/lucene/icu/tokenizer/BreakIteratorWrapper.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/icu/tokenizer/BreakIteratorWrapper.java	(revision 0)
+++ contrib/icu/src/java/org/apache/lucene/icu/tokenizer/BreakIteratorWrapper.java	(revision 0)
@@ -0,0 +1,181 @@
+package org.apache.lucene.icu.tokenizer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.CharacterIterator;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.DictionaryBasedBreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
+import com.ibm.icu.text.UTF16;
+
+/**
+ * Contains all the issues surrounding BreakIterators in ICU in one place.
+ * Basically this boils down to the fact that they aren't very friendly to any
+ * sort of OO design.
+ * <p>
+ * http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to
+ * BreakIterator from RuleBasedBreakIterator
+ * <p>
+ * DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, but
+ * doesn't actually behave as a subclass: it always returns 0 for
+ * getRuleStatus()
+ * <p>
+ * http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type
+ * tags
+ *
+ */
+abstract class BreakIteratorWrapper {
+  /** Reusable CharacterIterator view over the current region of the text. */
+  protected final CharArrayIterator textIterator = new CharArrayIterator();
+  /** Buffer holding the text being analyzed. */
+  protected char text[];
+  /** Offset into the buffer at which the analyzed region begins. */
+  protected int start;
+  /** Number of chars in the analyzed region. */
+  protected int length;
+
+  // boundaries are relative to the start of the region (the iterator's index 0)
+  abstract int next();
+  abstract int current();
+  abstract int getRuleStatus();
+  abstract void setText(CharacterIterator text);
+
+  /** Remember the region text[start .. start + length) and hand it to the breaker. */
+  void setText(char text[], int start, int length) {
+    this.text = text;
+    this.start = start;
+    this.length = length;
+    textIterator.setText(text, start, length);
+    setText(textIterator);
+  }
+
+  /*
+   * If it's a DictionaryBasedBreakIterator, it doesn't return rulestatus, so
+   * treat it like a generic BreakIterator. If it's any other
+   * RuleBasedBreakIterator, the rule status can be used for token type. If it's
+   * any other BreakIterator, the rulestatus method is not available, so treat
+   * it like a generic BreakIterator.
+   */
+  static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
+    if (breakIterator instanceof RuleBasedBreakIterator
+        && !(breakIterator instanceof DictionaryBasedBreakIterator))
+      return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
+    else
+      return new BIWrapper(breakIterator);
+  }
+
+  /*
+   * RuleBasedBreakIterator wrapper: RuleBasedBreakIterator (as long as it's not
+   * a DictionaryBasedBreakIterator) behaves correctly.
+   */
+  static final class RBBIWrapper extends BreakIteratorWrapper {
+    private final RuleBasedBreakIterator rbbi;
+
+    RBBIWrapper(RuleBasedBreakIterator rbbi) {
+      this.rbbi = rbbi;
+    }
+
+    int current() {
+      return rbbi.current();
+    }
+
+    int getRuleStatus() {
+      return rbbi.getRuleStatus();
+    }
+
+    int next() {
+      return rbbi.next();
+    }
+
+    void setText(CharacterIterator text) {
+      rbbi.setText(text);
+    }
+  }
+
+  /*
+   * Generic BreakIterator wrapper: Either the rulestatus method is not
+   * available or always returns 0. Calculate a rulestatus here so it behaves
+   * like RuleBasedBreakIterator.
+   *
+   * Note: This is slower than RuleBasedBreakIterator.
+   */
+  static final class BIWrapper extends BreakIteratorWrapper {
+    private final BreakIterator bi;
+
+    private int status;
+
+    BIWrapper(BreakIterator bi) {
+      this.bi = bi;
+    }
+
+    int current() {
+      return bi.current();
+    }
+
+    int getRuleStatus() {
+      return status;
+    }
+
+    int next() {
+      int current = bi.current();
+      int next = bi.next();
+      status = calcStatus(current, next);
+      return next;
+    }
+
+    private int calcStatus(int current, int next) {
+      if (current == BreakIterator.DONE || next == BreakIterator.DONE)
+        return RuleBasedBreakIterator.WORD_NONE;
+
+      int begin = start + current;
+      int end = start + next;
+
+      /*
+       * TODO: Consider optimizing BMP with this idiom:
+       * http://www.icu-project.org/docs/papers/supplementaries_iuc21.ppt
+       *
+       * for (int i = 0; i < s.length(); ++i) { int c = s.charAt(i); if (0xD800
+       * <= c && c <= 0xDBFF) { c = UTF16.charAt(s, i); i +=
+       * UTF16.getCharCount(c) - 1; } if (UCharacter.isLetter(c)) {
+       * doSomething(c); } }
+       */
+      // walk the token code point by code point; the first digit or letter decides
+      int codepoint;
+      for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) {
+        codepoint = UTF16.charAt(text, 0, end, i); // decode at i, not at begin
+
+        if (UCharacter.isDigit(codepoint))
+          return RuleBasedBreakIterator.WORD_NUMBER;
+        else if (UCharacter.isLetter(codepoint)) {
+          // TODO try to separately specify ideographic, kana? [currently all
+          // bundled as letter for this case]
+          return RuleBasedBreakIterator.WORD_LETTER;
+        }
+      }
+
+      return RuleBasedBreakIterator.WORD_NONE;
+    }
+
+    void setText(CharacterIterator text) {
+      bi.setText(text);
+      status = RuleBasedBreakIterator.WORD_NONE;
+    }
+  }
+}

Property changes on: contrib\icu\src\java\org\apache\lucene\icu\tokenizer\BreakIteratorWrapper.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/java/org/apache/lucene/icu/tokenizer/CharArrayIterator.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/icu/tokenizer/CharArrayIterator.java	(revision 0)
+++ contrib/icu/src/java/org/apache/lucene/icu/tokenizer/CharArrayIterator.java	(revision 0)
@@ -0,0 +1,117 @@
+package org.apache.lucene.icu.tokenizer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.CharacterIterator;
+
+/**
+ * Adapts a region of a char[] to the CharacterIterator interface so that it
+ * can be processed by a BreakIterator without copying the text.
+ */
+final class CharArrayIterator implements CharacterIterator {
+  private char chars[];     // backing buffer, shared with the caller
+  private int regionStart;  // absolute offset of the region within chars
+  private int regionLength; // number of chars in the region
+  private int regionEnd;    // regionStart + regionLength
+  private int cursor;       // absolute position within chars
+
+  public char [] getText() {
+    return chars;
+  }
+
+  public int getStart() {
+    return regionStart;
+  }
+
+  public int getLength() {
+    return regionLength;
+  }
+
+  /**
+   * Point this iterator at a new region of text and rewind it to the start of
+   * that region.
+   *
+   * @param array text buffer to examine
+   * @param start offset into buffer
+   * @param length maximum length to examine
+   */
+  void setText(final char array[], int start, int length) {
+    chars = array;
+    regionStart = start;
+    regionLength = length;
+    regionEnd = start + length;
+    cursor = start;
+  }
+
+  /** @return the char under the cursor, or DONE when the cursor is at the end */
+  public char current() {
+    return (cursor != regionEnd) ? chars[cursor] : DONE;
+  }
+
+  public char first() {
+    cursor = regionStart;
+    return current();
+  }
+
+  public int getBeginIndex() {
+    return 0;
+  }
+
+  public int getEndIndex() {
+    return regionLength;
+  }
+
+  public int getIndex() {
+    return cursor - regionStart;
+  }
+
+  public char last() {
+    cursor = (regionEnd != regionStart) ? regionEnd - 1 : regionEnd;
+    return current();
+  }
+
+  public char next() {
+    cursor++;
+    if (cursor < regionEnd)
+      return current();
+    cursor = regionEnd; // clamp so the cursor never runs past the region
+    return DONE;
+  }
+
+  public char previous() {
+    cursor--;
+    if (cursor >= regionStart)
+      return current();
+    cursor = regionStart; // clamp so the cursor never precedes the region
+    return DONE;
+  }
+
+  public char setIndex(int position) {
+    if (position < 0 || position > regionLength)
+      throw new IllegalArgumentException("Illegal Position: " + position);
+    cursor = regionStart + position;
+    return current();
+  }
+
+  public Object clone() {
+    CharArrayIterator copy = new CharArrayIterator();
+    copy.setText(chars, regionStart, regionLength);
+    copy.cursor = cursor;
+    return copy;
+  }
+}

Property changes on: contrib\icu\src\java\org\apache\lucene\icu\tokenizer\CharArrayIterator.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/java/org/apache/lucene/icu/tokenizer/CompositeBreakIterator.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/icu/tokenizer/CompositeBreakIterator.java	(revision 0)
+++ contrib/icu/src/java/org/apache/lucene/icu/tokenizer/CompositeBreakIterator.java	(revision 0)
@@ -0,0 +1,144 @@
+package org.apache.lucene.icu.tokenizer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Iterator;
+import java.util.Map;
+
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.BreakIterator;
+
+/**
+ * An internal BreakIterator for multilingual text, following recommendations
+ * from: UAX #29: Unicode Text Segmentation. (http://unicode.org/reports/tr29/)
+ * <p>
+ * This break iterator can be customized in up to two ways: 1. Providing a
+ * custom default BreakIterator. 2. Providing a mapping of BreakIterators that
+ * work with specific scripts.
+ * <p>
+ * See http://unicode.org/reports/tr29/#Tailoring for the motivation of this
+ * design.
+ * <p>
+ * Text is first divided into script boundaries. The processing is then
+ * delegated to the appropriate break iterator for that specific script. If no
+ * script-specific break iterator is available, the default break iterator
+ * given to the constructor is used.
+ * <p>
+ * This break iterator also allows you to retrieve the ISO 15924 script code
+ * associated with a piece of text.
+ * <p>
+ * See also UAX #29, UTR #24
+ *
+ */
+final class CompositeBreakIterator {
+  private final BreakIteratorWrapper breakersByScript[] = new BreakIteratorWrapper[UScript.CODE_LIMIT];
+  private final ScriptIterator scriptRuns = new ScriptIterator();
+  /* breaker for the script run currently being examined */
+  private BreakIteratorWrapper activeBreaker;
+  private char buffer[];
+
+  CompositeBreakIterator(BreakIterator rbbi, Map scriptHandlers) {
+    // every script starts out with the (shared) default word breaker
+    final BreakIteratorWrapper fallback = BreakIteratorWrapper.wrap(rbbi);
+    for (int script = 0; script < breakersByScript.length; script++)
+      breakersByScript[script] = fallback;
+
+    // then override the slots of scripts that have a custom word breaker
+    final Iterator handlers = scriptHandlers.entrySet().iterator();
+    while (handlers.hasNext()) {
+      final Map.Entry handler = (Map.Entry) handlers.next();
+      final Integer scriptCode = (Integer) handler.getKey();
+      final BreakIterator custom = (BreakIterator) handler.getValue();
+      breakersByScript[scriptCode.intValue()] = BreakIteratorWrapper.wrap(custom);
+    }
+  }
+
+  /**
+   * Retrieve the next break position. If the active breaker is exhausted
+   * within the current script run, move on to the next script run.
+   *
+   * @return the next break position or BreakIterator.DONE
+   */
+  int next() {
+    for (int boundary = activeBreaker.next();; boundary = activeBreaker.next()) {
+      if (boundary != BreakIterator.DONE)
+        return boundary + scriptRuns.getScriptStart();
+      if (!scriptRuns.next())
+        return BreakIterator.DONE; // no script runs left
+      useBreakerForCurrentRun();
+    }
+  }
+
+  /**
+   * Retrieve the current break position.
+   *
+   * @return the current break position or BreakIterator.DONE
+   */
+  int current() {
+    final int position = activeBreaker.current();
+    if (position == BreakIterator.DONE)
+      return BreakIterator.DONE;
+    return position + scriptRuns.getScriptStart();
+  }
+
+  /**
+   * Retrieve the rule status code (token type) from the underlying break
+   * iterator
+   *
+   * @return rule status code (see RuleBasedBreakIterator constants)
+   */
+  int getRuleStatus() {
+    return activeBreaker.getRuleStatus();
+  }
+
+  /**
+   * Retrieve the UScript script code for the current token. This code can be
+   * decoded with UScript into a name or ISO 15924 code.
+   *
+   * @return UScript script code for the current token.
+   */
+  int getScriptCode() {
+    return scriptRuns.getScriptCode();
+  }
+
+  /**
+   * Set a new region of text to be examined by this iterator
+   *
+   * @param text buffer of text
+   * @param start offset into buffer
+   * @param length maximum length to examine
+   */
+  void setText(final char text[], int start, int length) {
+    buffer = text;
+    scriptRuns.setText(text, start, length);
+    if (scriptRuns.next()) {
+      useBreakerForCurrentRun();
+      return;
+    }
+    // no script runs at all: park the COMMON breaker on an empty region
+    activeBreaker = breakersByScript[UScript.COMMON];
+    activeBreaker.setText(text, 0, 0);
+  }
+
+  /* Select the breaker registered for the current script run and aim it at that run. */
+  private void useBreakerForCurrentRun() {
+    activeBreaker = breakersByScript[scriptRuns.getScriptCode()];
+    final int runStart = scriptRuns.getScriptStart();
+    activeBreaker.setText(buffer, runStart, scriptRuns.getScriptLimit() - runStart);
+  }
+}

Property changes on: contrib\icu\src\java\org\apache\lucene\icu\tokenizer\CompositeBreakIterator.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/java/org/apache/lucene/icu/tokenizer/ICUTokenizer.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/icu/tokenizer/ICUTokenizer.java	(revision 0)
+++ contrib/icu/src/java/org/apache/lucene/icu/tokenizer/ICUTokenizer.java	(revision 0)
@@ -0,0 +1,331 @@
+package org.apache.lucene.icu.tokenizer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
+import com.ibm.icu.util.ULocale;
+
+/**
+ * Breaks text into words according to UAX #29: Unicode Text Segmentation
+ * (http://www.unicode.org/reports/tr29/)
+ * <p>
+ * Words are broken across script boundaries and unicode boundaries based upon
+ * their unicode properties.
+ * </p>
+ * Each token carries the UScript code of its script in its FlagsAttribute.
+ */
+public final class ICUTokenizer extends Tokenizer {
+  /**
+   * Token type for words containing ideographic characters
+   */
+  public static final String WORD_IDEO = "IDEO";
+
+  /**
+   * Token type for words containing kana characters
+   */
+  public static final String WORD_KANA = "KANA";
+
+  /**
+   * Token type for words that contain letters, excluding hiragana, katakana, or
+   * ideographic characters.
+   */
+  public static final String WORD_LETTER = "WORD";
+
+  /**
+   * Token type for words that appear to be numbers.
+   */
+  public static final String WORD_NUMBER = "NUM";
+
+  private static final int IOBUFFER = 4096;
+  private final char buffer[] = new char[IOBUFFER];
+
+  /* true length of text in the buffer */
+  private int length = 0;
+  /* length of text in the buffer that can be evaluated safely, up to a safe end point */
+  private int usableLength = 0;
+  /* accumulated offset of previous buffers for this Reader, for correct term offsets */
+  private int offset = 0;
+
+  /* tokenizes a char[] of text */
+  private final CompositeBreakIterator breaker;
+
+  private OffsetAttribute offsetAtt;
+  private TermAttribute termAtt;
+  private TypeAttribute typeAtt;
+  private FlagsAttribute flagsAtt;
+  private PositionIncrementAttribute posIncAtt;
+
+  /**
+   * Construct a new ICUTokenizer that breaks text into words from the given
+   * Reader.
+   * <p>
+   * Text will be broken into words by the Unicode text segmentation algorithm:
+   * BreakIterator.getWordInstance(ULocale.ROOT)
+   * </p>
+   * <p>
+   * The default script-specific handling is used: specifically Thai text is
+   * broken into words with a Thai DictionaryBasedBreakIterator
+   * </p>
+   *
+   * @param input Reader containing text to tokenize.
+   */
+  public ICUTokenizer(Reader input) {
+    this(input, null, null);
+  }
+
+  /**
+   * Construct a new ICUTokenizer that breaks text into words from the given
+   * Reader, using a tailored default BreakIterator
+   * <p>
+   * The default script-specific handling is used: specifically Thai text is
+   * broken into words with a Thai DictionaryBasedBreakIterator
+   * </p>
+   *
+   * @param input Reader containing text to tokenize.
+   * @param defaultBreakIterator Tailored BreakIterator that breaks text into
+   *        words.
+   */
+  public ICUTokenizer(Reader input, BreakIterator defaultBreakIterator) {
+    this(input, defaultBreakIterator, null);
+  }
+
+  /**
+   * Construct a new ICUTokenizer that breaks text into words from the given
+   * Reader, using a tailored default BreakIterator and a custom mapping of
+   * script-specific BreakIterators
+   *
+   * @param input Reader containing text to tokenize.
+   * @param defaultBreakIterator Tailored BreakIterator that breaks text into
+   *        words, or null to use a clone of the ROOT locale word BreakIterator.
+   * @param scriptHandlers A {@link Map} mapping UScript codes to BreakIterators
+   *        for script-specific handling, or null for the default handlers.
+   */
+  public ICUTokenizer(Reader input, BreakIterator defaultBreakIterator,
+      Map scriptHandlers) {
+    super(input);
+
+    if (defaultBreakIterator == null)
+      defaultBreakIterator = (BreakIterator) rootBreakIterator.clone();
+
+    if (scriptHandlers == null) {
+      scriptHandlers = new HashMap();
+      /* Hebrew has special punctuation handling */
+      scriptHandlers.put(new Integer(UScript.HEBREW),
+          (BreakIterator) hebrewBreakIterator.clone());
+      /* Thai has no visible word separation, use dictionary-based break iterator */
+      scriptHandlers.put(new Integer(UScript.THAI),
+          (BreakIterator) thaiBreakIterator.clone());
+      /* Khmer has no visible word separation, break into syllables */
+      scriptHandlers.put(new Integer(UScript.KHMER),
+          (BreakIterator) khmerBreakIterator.clone());
+      /* Lao has no visible word separation, break into syllables */
+      scriptHandlers.put(new Integer(UScript.LAO),
+          (BreakIterator) laoBreakIterator.clone());
+      /* Myanmar has no visible word separation, break into syllables */
+      scriptHandlers.put(new Integer(UScript.MYANMAR),
+          (BreakIterator) myanmarBreakIterator.clone());
+    }
+
+    breaker = new CompositeBreakIterator(defaultBreakIterator, scriptHandlers);
+    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+    flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
+    posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+  }
+
+  /**
+   * Return the token's lexical type.
+   * <p>
+   * For a tailored ruleset, you may want to override this method to return
+   * additional types.
+   * </p>
+   *
+   * @return String containing the token type
+   */
+  protected String getType() {
+    switch (breaker.getRuleStatus()) {
+      case RuleBasedBreakIterator.WORD_IDEO:
+        return ICUTokenizer.WORD_IDEO;
+      case RuleBasedBreakIterator.WORD_KANA:
+        return ICUTokenizer.WORD_KANA;
+      case RuleBasedBreakIterator.WORD_LETTER:
+        return ICUTokenizer.WORD_LETTER;
+      case RuleBasedBreakIterator.WORD_NUMBER:
+        return ICUTokenizer.WORD_NUMBER;
+      default: /* RuleBasedBreakIterator.WORD_NONE, not possible */
+        return "NONE";
+    }
+  }
+
+  public boolean incrementToken() throws IOException {
+    clearAttributes();
+    if (length == 0)
+      refill();
+    while (!incrementTokenBuffer()) {
+      refill();
+      if (length <= 0) // no more chars to read
+        return false;
+    }
+    return true;
+  }
+
+  public void reset(Reader input) throws IOException {
+    super.reset(input);
+    breaker.setText(buffer, 0, 0);
+    length = usableLength = offset = 0;
+  }
+
+  /*
+   * The method is as follows:
+   *
+   * The ICU RBBI implementation for the ROOT locale is used. This tokenizes
+   * text based upon the longest matching rule, and because of this, isn't
+   * friendly to a Reader.
+   *
+   * Text is read from the input stream in 4kB chunks. Within a 4kB chunk of
+   * text, the last unambiguous break point is found (in this implementation:
+   * white space character) Any remaining characters represent possible partial
+   * words, so are appended to the front of the next chunk.
+   *
+   * There is the possibility that there are no unambiguous break points within
+   * an entire 4kB chunk of text (binary data) Currently, this could cause what
+   * should really be an 8kB word to be parsed as two 4kB words instead. This
+   * keeps the code simple, but other possibilities could be: A. increasing the
+   * buffer size and reading more data, hoping to find an unambiguous boundary,
+   * at the risk of OOM error. B. discarding the chunk entirely and instead
+   * looking for a set of unambiguous start-end boundaries in future text.
+   *
+   * Option B is closer to what other Lucene analyzers do, but would increase
+   * code complexity significantly.
+   */
+
+  /**
+   * Returns the last unambiguous break position in the text.
+   *
+   * @return position of character, or -1 if one does not exist
+   */
+  private int findSafeEnd() {
+    for (int i = length - 1; i >= 0; i--)
+      if (UCharacter.isWhitespace(buffer[i]))
+        return i + 1;
+    return -1;
+  }
+
+  /**
+   * Refill the buffer, accumulating the offset and setting usableLength to the
+   * last unambiguous break position. Reader.read may return fewer chars than
+   * requested before the end of input, so keep reading until the buffer is
+   * full or the reader is drained; otherwise a short read would be mistaken
+   * for the end of input and a token spanning it would be split.
+   * @throws IOException if the underlying Reader fails
+   */
+  private void refill() throws IOException {
+    offset += usableLength;
+    int leftover = length - usableLength;
+    System.arraycopy(buffer, usableLength, buffer, 0, leftover);
+    int requested = buffer.length - leftover;
+    int returned = 0;
+    while (returned < requested) {
+      int count = input.read(buffer, leftover + returned, requested - returned);
+      if (count <= 0)
+        break; /* end of input (or nothing delivered): stop asking */
+      returned += count;
+    }
+    length = leftover + returned;
+    if (returned < requested) /* reader has been emptied, process the rest */
+      usableLength = length;
+    else { /* still more data to be read, find a safe-stopping place */
+      usableLength = findSafeEnd();
+      if (usableLength < 0) /* IOBUFFER chars without space: tokens may be truncated */
+        usableLength = length;
+    }
+
+    breaker.setText(buffer, 0, Math.max(0, usableLength));
+  }
+
+  /*
+   * return true if there is a token from the buffer, or false if it is exhausted.
+   */
+  private boolean incrementTokenBuffer() {
+    int start = breaker.current();
+    if (start == BreakIterator.DONE)
+      return false; // BreakIterator exhausted
+
+    // find the next set of boundaries, skipping over non-tokens (rule status 0);
+    // test end, not start, so a stale status after DONE never yields a token
+    int end = breaker.next();
+    while (end != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
+      start = end;
+      end = breaker.next();
+    }
+
+    if (end == BreakIterator.DONE)
+      return false; // BreakIterator exhausted
+
+    int termLength = end - start;
+    termAtt.setTermBuffer(buffer, start, termLength);
+    offsetAtt.setOffset(correctOffset(offset + start), correctOffset(offset + end));
+    typeAtt.setType(getType());
+    flagsAtt.setFlags(breaker.getScriptCode());
+    return true;
+  }
+
+  /* the default breakiterators in use. these can be expensive to instantiate,
+   * cheap to clone. */
+  private static final BreakIterator rootBreakIterator = BreakIterator
+      .getWordInstance(ULocale.ROOT);
+
+  private static final BreakIterator thaiBreakIterator = BreakIterator
+      .getWordInstance(new ULocale("th_TH"));
+  private static final BreakIterator hebrewBreakIterator = readBreakIterator("Hebrew.brk");
+  private static final BreakIterator khmerBreakIterator = readBreakIterator("Khmer.brk");
+  private static final BreakIterator laoBreakIterator = new LaoBreakIterator(readBreakIterator("Lao.brk"));
+  private static final BreakIterator myanmarBreakIterator = readBreakIterator("Myanmar.brk");
+
+  private static RuleBasedBreakIterator readBreakIterator(String filename) {
+    InputStream is = ICUTokenizer.class.getResourceAsStream(filename);
+    if (is == null) // fail with the resource name instead of a bare NPE
+      throw new RuntimeException("break rules resource not found: " + filename);
+    try {
+      try {
+        return RuleBasedBreakIterator.getInstanceFromCompiledRules(is);
+      } finally {
+        is.close(); // also release the stream when the rules fail to load
+      }
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+}

Property changes on: contrib\icu\src\java\org\apache\lucene\icu\tokenizer\ICUTokenizer.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/java/org/apache/lucene/icu/tokenizer/LaoBreakIterator.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/icu/tokenizer/LaoBreakIterator.java	(revision 0)
+++ contrib/icu/src/java/org/apache/lucene/icu/tokenizer/LaoBreakIterator.java	(revision 0)
@@ -0,0 +1,167 @@
+package org.apache.lucene.icu.tokenizer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.CharacterIterator;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
+import com.ibm.icu.text.UnicodeSet;
+
+/** Syllable BreakIterator for Lao text (forward-only; text must be a CharArrayIterator). */
+public class LaoBreakIterator extends BreakIterator {
+  RuleBasedBreakIterator rules;
+  CharArrayIterator text;
+  CharArrayIterator working = new CharArrayIterator();
+  int workingOffset = 0;
+  CharArrayIterator verifyText = new CharArrayIterator();
+  RuleBasedBreakIterator verify;
+
+  private static final UnicodeSet laoSet;
+  static {
+    laoSet = new UnicodeSet("[:Lao:]");
+    laoSet.compact();
+    laoSet.freeze();
+  }
+
+  public LaoBreakIterator(RuleBasedBreakIterator rules) {
+    this.rules = rules;
+    this.verify = (RuleBasedBreakIterator) rules.clone();
+  }
+
+  // give each clone its own mutable state; call setText on the clone before use
+  public Object clone() {
+    LaoBreakIterator other = (LaoBreakIterator) super.clone();
+    other.rules = (RuleBasedBreakIterator) rules.clone();
+    other.verify = (RuleBasedBreakIterator) verify.clone();
+    other.working = (CharArrayIterator) working.clone();
+    other.verifyText = (CharArrayIterator) verifyText.clone();
+    if (text != null)
+      other.text = (CharArrayIterator) text.clone();
+    return other;
+  }
+
+  public int current() {
+    int current = rules.current();
+    return current == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + current;
+  }
+
+  public int first() {
+    working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
+    rules.setText(working);
+    workingOffset = 0;
+    int first = rules.first();
+    return first == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + first;
+  }
+
+  public int following(int offset) {
+    throw new UnsupportedOperationException("following is unsupported");
+  }
+
+  public CharacterIterator getText() {
+    return text;
+  }
+
+  public int last() {
+    throw new UnsupportedOperationException("last is unsupported");
+  }
+
+  public int next() {
+    int current = current();
+    int next = rules.next();
+    if (next == BreakIterator.DONE)
+      return next;
+    next += workingOffset;
+
+    char c = working.current();
+    int following = rules.next(); // lookahead
+    if (following != BreakIterator.DONE) {
+      if (rules.getRuleStatus() == 0 && laoSet.contains(c) && verifyPushBack(current, next)) {
+        workingOffset = next - 1;
+        working.setText(text.getText(), text.getStart() + workingOffset, text.getLength() - workingOffset);
+        return next - 1;
+      }
+      rules.previous(); // undo the lookahead
+    }
+    return next;
+  }
+
+  public int next(int n) {
+    if (n < 0)
+      throw new UnsupportedOperationException("Backwards traversal is unsupported");
+    int result = current();
+    while (n-- > 0)
+      result = next();
+    return result;
+  }
+
+  public int previous() {
+    throw new UnsupportedOperationException("Backwards traversal is unsupported");
+  }
+
+  public void setText(CharacterIterator text) {
+    if (!(text instanceof CharArrayIterator))
+      throw new UnsupportedOperationException("unsupported CharacterIterator");
+    this.text = (CharArrayIterator) text;
+    ccReorder(this.text.getText(), this.text.getStart(), this.text.getLength());
+    working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
+    rules.setText(working);
+    workingOffset = 0;
+  }
+
+  public void setText(String newText) {
+    CharArrayIterator ci = new CharArrayIterator();
+    ci.setText(newText.toCharArray(), 0, newText.length());
+    setText(ci);
+  }
+
+  private boolean verifyPushBack(int current, int next) {
+    int shortenedSyllable = next - current - 1;
+    verifyText.setText(text.getText(), text.getStart() + current, shortenedSyllable);
+    verify.setText(verifyText);
+    if (verify.next() != shortenedSyllable || verify.getRuleStatus() == 0)
+      return false;
+
+    verifyText.setText(text.getText(), text.getStart() + next - 1, text.getLength() - next + 1);
+    verify.setText(verifyText);
+
+    return (verify.next() != BreakIterator.DONE && verify.getRuleStatus() != 0);
+  }
+
+  // TODO: only bubblesort around runs of combining marks, instead of the entire text.
+  private void ccReorder(char[] text, int start, int length) {
+    boolean reordered;
+    do {
+      int prevCC = 0; // reset per pass, so text[start] is never swapped with text[start - 1]
+      reordered = false;
+      for (int i = start; i < start + length; i++) {
+        final char c = text[i];
+        final int cc = UCharacter.getCombiningClass(c);
+        if (cc > 0 && cc < prevCC) {
+          // swap
+          text[i] = text[i - 1];
+          text[i - 1] = c;
+          reordered = true;
+        } else {
+          prevCC = cc;
+        }
+      }
+    } while (reordered == true);
+  }
+}

Property changes on: contrib\icu\src\java\org\apache\lucene\icu\tokenizer\LaoBreakIterator.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/java/org/apache/lucene/icu/tokenizer/ScriptIterator.java
===================================================================
--- contrib/icu/src/java/org/apache/lucene/icu/tokenizer/ScriptIterator.java	(revision 0)
+++ contrib/icu/src/java/org/apache/lucene/icu/tokenizer/ScriptIterator.java	(revision 0)
@@ -0,0 +1,183 @@
+package org.apache.lucene.icu.tokenizer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.UTF16;
+
+/**
+ * An iterator that locates ISO 15924 script boundaries in text. This is not the
+ * same as simply looking at the Unicode block, or even the Script property.
+ * Some characters are 'common' across multiple scripts, and some 'inherit' the
+ * script value of text surrounding them.
+ * 
+ * This is similar to ICU (internal-only) UScriptRun, with the following
+ * differences:
+ * 
+ * Doesn't attempt to match paired punctuation. For tokenization purposes, this
+ * is not necessary. It's also quite expensive. Non-spacing marks inherit the
+ * script of their base character, following recommendations from UTR #24.
+ * 
+ * TODO: Runs of Han ideographs and Hiragana/Katakana need to be mapped to
+ * Japanese script.
+ * TODO: Runs of Han ideographs and Hangul need to be mapped to Korean script.
+ */
+final class ScriptIterator {
+  // Buffer being scanned; only the region [start, limit) is examined.
+  private char[] text;
+
+  private int start;
+  private int limit;
+
+  // Offset into text of the next code unit to classify.
+  private int index;
+
+  // The current run is [scriptStart, scriptLimit) and has script scriptCode.
+  private int scriptStart;
+  private int scriptLimit;
+  private int scriptCode;
+
+  /**
+   * Start offset of the current script run.
+   * 
+   * @return index of the first char in the run
+   */
+  int getScriptStart() {
+    return this.scriptStart;
+  }
+
+  /**
+   * End offset (exclusive) of the current script run.
+   * 
+   * @return index of the first char following the run
+   */
+  int getScriptLimit() {
+    return this.scriptLimit;
+  }
+
+  /**
+   * UScript code assigned to the current script run.
+   * 
+   * @return script code of the run
+   */
+  int getScriptCode() {
+    return this.scriptCode;
+  }
+
+  /**
+   * Advances to the next script run, if there is one.
+   * 
+   * @return true when a run was found, false once the text is exhausted.
+   */
+  boolean next() {
+    if (scriptLimit >= limit) {
+      return false; // nothing left to scan
+    }
+
+    // The new run begins where the last one ended; its script is still undecided.
+    scriptStart = scriptLimit;
+    scriptCode = UScript.COMMON;
+
+    /*
+     * TODO: Consider optimizing BMP with this idiom:
+     * http://www.icu-project.org/docs/papers/supplementaries_iuc21.ppt
+     * 
+     * for (int i = 0; i < s.length(); ++i) {
+     *   int c = s.charAt(i);
+     *   if (0xD800 <= c && c <= 0xDBFF) { c = UTF16.charAt(s, i); i += UTF16.getCharCount(c) - 1; }
+     *   if (UCharacter.isLetter(c)) { doSomething(c); }
+     * }
+     */
+
+    while (index < limit) {
+      final int codepoint = UTF16.charAt(text, start, limit, index - start);
+      final int script = getScript(codepoint);
+
+      /*
+       * From UTR #24: Implementations that determine the boundaries between
+       * characters of given scripts should never break between a non-spacing
+       * mark and its base character. Thus for boundary determinations and
+       * similar sorts of processing, a non-spacing mark — whatever its script
+       * value — should inherit the script value of its base character.
+       */
+
+      final boolean extendsRun = isSameScript(scriptCode, script)
+          || UCharacter.getType(codepoint) == UCharacter.NON_SPACING_MARK;
+      if (!extendsRun) {
+        break;
+      }
+      index += UTF16.getCharCount(codepoint);
+
+      // A run that is still Common or Inherited adopts the first specific script seen.
+      if (scriptCode <= UScript.INHERITED && script > UScript.INHERITED) {
+        scriptCode = script;
+      }
+    }
+
+    scriptLimit = index;
+    return true;
+  }
+
+  // Scripts are compatible when equal, or when either is at or below UScript.INHERITED.
+  private static boolean isSameScript(int scriptOne, int scriptTwo) {
+    if (scriptOne == scriptTwo) {
+      return true;
+    }
+    return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED;
+  }
+
+  /**
+   * Points this iterator at a new region of text and rewinds it.
+   * 
+   * @param text text buffer to examine
+   * @param start offset into buffer
+   * @param length maximum length to examine
+   */
+  void setText(char[] text, int start, int length) {
+    this.text = text;
+    this.start = start;
+    this.limit = start + length;
+    // nothing consumed yet: the cursor and an empty current run all sit at start
+    this.index = this.scriptStart = this.scriptLimit = start;
+    // no run has been computed yet, so there is no script to report
+    this.scriptCode = UScript.INVALID_CODE;
+  }
+
+  /*
+   * Script codes for the first 128 code points (Basic Latin), precomputed so
+   * that getScript() can answer them with a plain array lookup.
+   */
+  private static final int[] basicLatin = new int[128];
+
+  static {
+    for (int codepoint = 0; codepoint < basicLatin.length; ++codepoint)
+      basicLatin[codepoint] = UScript.getScript(codepoint);
+  }
+
+  /*
+   * Faster UScript.getScript(): code points covered by the precomputed table
+   * are looked up there, everything else is delegated to UScript.
+   */
+  private static int getScript(int codepoint) {
+    final boolean inTable = codepoint >= 0 && codepoint < basicLatin.length;
+    return inTable
+        ? basicLatin[codepoint]
+        : UScript.getScript(codepoint);
+  }
+}

Property changes on: contrib\icu\src\java\org\apache\lucene\icu\tokenizer\ScriptIterator.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/java/org/apache/lucene/icu/tokenizer/package.html
===================================================================
--- contrib/icu/src/java/org/apache/lucene/icu/tokenizer/package.html	(revision 0)
+++ contrib/icu/src/java/org/apache/lucene/icu/tokenizer/package.html	(revision 0)
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<body>
+Tokenizer that breaks text into words with the Unicode Text Segmentation algorithm.
+</body>
+</html>
\ No newline at end of file

Property changes on: contrib\icu\src\java\org\apache\lucene\icu\tokenizer\package.html
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/resources/org/apache/lucene/icu/tokenizer/Hebrew.rbbi
===================================================================
--- contrib/icu/src/resources/org/apache/lucene/icu/tokenizer/Hebrew.rbbi	(revision 0)
+++ contrib/icu/src/resources/org/apache/lucene/icu/tokenizer/Hebrew.rbbi	(revision 0)
@@ -0,0 +1,61 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+# This is an example of rule tailoring for Hebrew.
+# In this example the single-quote (U+0027) is added to the Extend category.
+# The double-quote (U+0022) is added to the MidLetter category.
+#
+!!chain;
+$CR           = [\p{Word_Break = CR}];
+$LF           = [\p{Word_Break = LF}];
+$Newline      = [\p{Word_Break = Newline}];
+$Extend       = [\p{Word_Break = Extend}\u0027];
+$Format       = [\p{Word_Break = Format}];
+$ALetter      = [\p{Word_Break = ALetter}];
+$MidNumLet    = [\p{Word_Break = MidNumLet}];
+$MidLetter    = [\p{Word_Break = MidLetter}\u0022];
+$MidNum       = [\p{Word_Break = MidNum}];
+$Numeric      = [\p{Word_Break = Numeric}];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+$dictionary   = [:LineBreak = Complex_Context:];
+$Control        = [\p{Grapheme_Cluster_Break = Control}]; 
+$ALetterPlus  = [$ALetter [$dictionary-$Extend-$Control]]; 
+                                                              
+$ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;
+$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
+$MidLetterEx    = $MidLetter    ($Extend |  $Format)*;
+$MidNumEx       = $MidNum       ($Extend |  $Format)*;
+$NumericEx      = $Numeric      ($Extend |  $Format)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
+
+!!forward;
+
+$CR $LF;
+[^$CR $LF $Newline]? ($Extend |  $Format)+;
+$NumericEx {100};
+$ALetterEx {200};    
+$ALetterEx $ALetterEx {200};
+$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
+$NumericEx $NumericEx {100};
+$ALetterEx $NumericEx {200};
+$NumericEx $ALetterEx {200};
+$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
+$ALetterEx      $ExtendNumLetEx {200};    
+$NumericEx      $ExtendNumLetEx {100};      
+$ExtendNumLetEx $ExtendNumLetEx {200};    
+$ExtendNumLetEx $ALetterEx  {200};    
+$ExtendNumLetEx $NumericEx  {100};    
Index: contrib/icu/src/resources/org/apache/lucene/icu/tokenizer/Khmer.rbbi
===================================================================
--- contrib/icu/src/resources/org/apache/lucene/icu/tokenizer/Khmer.rbbi	(revision 0)
+++ contrib/icu/src/resources/org/apache/lucene/icu/tokenizer/Khmer.rbbi	(revision 0)
@@ -0,0 +1,70 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# 
+# Parses Khmer text, with orthographic syllable as token.
+# ICUBigramFilter will form bigrams of these syllables downstream.
+# This is an example of how one can use a tailored ruleset.
+# The definition of Khmer orthographic syllable is taken directly from the Unicode Standard.
+#
+!!chain;
+#
+# B = base character (consonant, independent vowel, etc)
+$KhmerBase = [\u1780-\u17B3];
+# R = robat
+$KhmerRobat = [\u17CC];
+# C = consonant shifter
+$KhmerShifter = [\u17C9\u17CA];
+# S = subscript consonant or independent vowel sign
+$KhmerSub = ([\u17D2] $KhmerBase);
+# V = dependent vowel sign
+$KhmerVowel = [\u17B4-\u17C5];
+# Z = zero-width joiner or non-joiner
+$KhmerZWC = [\u200C\u200D];
+# O = any other sign
+$KhmerSign = [\u17C6-\u17C8\u17CB\u17CD-\u17D1\u17DC\u17DD]; 
+
+$WordJoin = [:Line_Break=Word_Joiner:];
+
+$KhmerSyllableEx = $KhmerBase ($KhmerRobat | $KhmerShifter)? ($KhmerSub ($KhmerRobat)?)* (($KhmerZWC)? $KhmerVowel)? ($KhmerSign)? ($KhmerSub)?;
+
+$KhmerJoinedSyllableEx = $KhmerSyllableEx ($WordJoin $KhmerSyllableEx)*;
+
+#
+# default numerical definitions
+#
+$Extend       = [\p{Word_Break = Extend}];
+$Format       = [\p{Word_Break = Format}];
+$MidNumLet    = [\p{Word_Break = MidNumLet}];
+$MidNum       = [\p{Word_Break = MidNum}];
+$Numeric      = [\p{Word_Break = Numeric}];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];                                                          
+$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
+$MidNumEx       = $MidNum       ($Extend |  $Format)*;
+$NumericEx      = $Numeric      ($Extend |  $Format)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
+
+!!forward;
+$KhmerJoinedSyllableEx {200};
+
+#
+# default numeric rules
+#
+$NumericEx {100};
+$NumericEx $NumericEx {100};
+$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
+$NumericEx      $ExtendNumLetEx {100};  
+$ExtendNumLetEx $NumericEx  {100};
Index: contrib/icu/src/resources/org/apache/lucene/icu/tokenizer/Lao.rbbi
===================================================================
--- contrib/icu/src/resources/org/apache/lucene/icu/tokenizer/Lao.rbbi	(revision 0)
+++ contrib/icu/src/resources/org/apache/lucene/icu/tokenizer/Lao.rbbi	(revision 0)
@@ -0,0 +1,215 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# 
+# Parses Lao text, with syllable as token.
+# This is an example of how one can use a tailored ruleset.
+# ICUBigramFilter can then be used to form bigrams of these syllables downstream
+#
+# The definition of Lao syllable is based on:
+#
+#   Syllabification of Lao Script for Line Breaking
+#   Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak, 
+#     Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP
+#   http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
+#   http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
+#
+# NOTE:
+# There are some ambiguities in Lao syllabification without additional processing, as mentioned in the paper.
+# For this reason, this RBBI grammar really only works with LaoBreakIterator
+# For example, what appears to be a final consonant might instead be part of the next syllable.
+# The first rule will match in a greedy fashion, leaving an illegal sequence that matches no rules.
+# The first rule greedily matches ກວ່າດ, but then ອກ is encountered, which is illegal.
+#
+# What LaoBreakIterator does, according to the paper:
+#  1. backtrack and remove the ດ from the last syllable, placing it on the current syllable.
+#  2. verify the modified previous syllable (ກວ່າ ) is still legal.
+#  3. verify the modified current syllable (ດອກ) is now legal.
+#  4. If 2 or 3 fails, then restore the ດ to the last syllable and skip the current character.
+#
+# Finally, LaoBreakIterator also takes care of the second concern mentioned in the paper.
+# This is the issue of combining marks being in the wrong order (typos)
+#
+# TODO: Since we turned off rule-chaining, write the numeric rules in unchained form.
+#
+# Syllable structure, where X is the nuclear consonant:
+#
+#           +----+
+#           | X5 |
+#           +----+
+#           | X4 |
+# +----+----+----+----+----+----+----+-----+
+# | X0 | X1 | X  | X6 | X7 | X8 | X9 | X10 |
+# +----+----+----+----+----+----+----+-----+
+#           | X2 |
+#           +----+
+#           | X3 |
+#           +----+
+#
+# X0 represents a vowel which occurs before the nuclear consonant. 
+# It can always define the beginning of syllable.
+$X0 = [\u0EC0-\u0EC4];
+# X1 is a combination consonant which comes before the nuclear consonant, 
+# but only if nuclear consonant is one of {ງ ຍ ລ ວ ຼ ມ ນ ຣ}
+$X1 = [\u0EAB];
+# X represents the nuclear consonant.
+$X = [\u0E81-\u0EAE\u0EDC\u0EDD];
+# X2 is a combination consonant which comes after the nuclear consonant, 
+# which is placed under or next to the nuclear consonant.
+$X2 = [\u0EBC\u0EA3\u0EA7\u0EA5];
+# X3 represents a vowel which occurs under the nuclear consonant.
+$X3 = [\u0EB8\u0EB9];
+# X4 represents a vowel which occurs above the nuclear consonant. 
+$X4 = [\u0EB4-\u0EB7\u0ECD\u0EBB\u0EB1];
+# X5 represents a tone mark which occurs above the nuclear consonant or upper vowel.
+$X5 = [\u0EC8-\u0ECB];
+# X6 represents a consonant vowel, which occurs after the nuclear consonant. 
+# It functions when the syllable doesn’t have any vowels. And it always exists with X8.
+$X6 = [\u0EA7\u0EAD\u0EBD];
+# X7 represents a final vowel. 
+# However X7_1 always represents the end of syllable and it never exists with tone mark.
+$X7 = [\u0EB0\u0EB2\u0EB3];
+# X8 represents an alternate consonant.
+$X8 = [\u0E81\u0E87\u0E8D\u0E94\u0E99\u0EA1\u0E9A\u0EA7];
+# X9 represents alternate consonants to pronounce foreign terms; they always exist with X10_3.
+$X9 = [\u0E88\u0EAA\u0E8A\u0E9E\u0E9F\u0EA5];
+# X10 represents a sign mark. 
+# It always occurs at the end of a syllable, but mostly people keep it separate from syllable.
+$X10 = [\u0EAF\u0EC6\u0ECC];
+
+# Section 1
+$X0_1 = [\u0EC0];
+$X4_1_2 = [\u0EB4\u0EB5];
+$X4_3_4 = [\u0EB6\u0EB7];
+$X4_6 = [\u0EBB];
+$X4_7 = [\u0EB1];
+$X6_2 = [\u0EAD];
+$X6_3 = [\u0EBD];
+$X7_1 = [\u0EB0];
+$X7_2 = [\u0EB2];
+$X10_1 = [\u0EAF];
+$X10_2 = [\u0EC6];
+$X10_3 = [\u0ECC];
+
+$Rule1_1 = $X0_1 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule1_2 = $X0_1 ($X1)? $X ($X2)? $X4_1_2 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule1_3 = $X0_1 ($X1)? $X ($X2)? $X4_3_4 ($X5)? $X6_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule1_4 = $X0_1 ($X1)? $X ($X2)? ($X7_2)? $X7_1;
+$Rule1_5 = $X0_1 ($X1)? $X ($X2)? $X4_6 ($X5)? $X7_2 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule1_6 = $X0_1 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule1_7 = $X0_1 ($X1)? $X ($X2)? ($X4_7)? ($X5)? $X6_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+$Rule1 = ($Rule1_1 | $Rule1_2 | $Rule1_3 | $Rule1_4 | $Rule1_5 | $Rule1_6 | $Rule1_7);
+
+# Section 2
+$X0_2 = [\u0EC1];
+
+$Rule2_1 = $X0_2 ($X1)? $X ($X2)? ($X5)? ($X6)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule2_2 = $X0_2 ($X1)? $X ($X2)? $X7_1;
+$Rule2_3 = $X0_2 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 
+
+$Rule2 = ($Rule2_1 | $Rule2_2 | $Rule2_3);
+
+# Section 3
+$X0_3 = [\u0EC2];
+$X8_3 = [\u0E8D];
+$X8_8 = [\u0EA7];
+
+$Rule3_1 = $X0_3 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule3_2 = $X0_3 ($X1)? $X ($X2)? $X7_1;
+$Rule3_3 = $X0_3 ($X1)? $X ($X2)? $X4_7 ($X5)? ($X8_3 | $X8_8);
+
+$Rule3 = ($Rule3_1 | $Rule3_2 | $Rule3_3);
+
+# Section 4
+$X0_4 = [\u0EC4];
+$X6_1 = [\u0EA7];
+
+$Rule4 = $X0_4 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 5
+$X0_5 = [\u0EC3];
+
+$Rule5 = $X0_5 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 6
+$Rule6 = ($X1)? $X ($X2)? $X3 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 7
+$X4_1_4 = [\u0EB4-\u0EB7];
+
+$Rule7 = ($X1)? $X ($X2)? $X4_1_4 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 8
+$X4_5 = [\u0ECD];
+
+$Rule8 = ($X1)? $X ($X2)? $X4_5 ($X5)? ($X7_2)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 9
+
+$Rule9_1 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule9_2 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X6_1 $X7_1;
+
+$Rule9 = ($Rule9_1 | $Rule9_2);
+
+# Section 10
+$Rule10 = ($X1)? $X ($X2)? $X4_7 ($X5)? ($X6_1)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 11
+$Rule11 = ($X1)? $X ($X2)? ($X5)? $X6 $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 12
+$Rule12 = ($X1)? $X ($X2)? ($X5)? $X7_1;
+
+# Section 13
+$Rule13 = ($X1)? $X ($X2)? ($X5)? $X7_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 14
+$X7_3 = [\u0EB3];
+
+$Rule14 = ($X1)? $X ($X2)? ($X5)? $X7_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+$LaoSyllableEx = ($Rule1 | $Rule2 | $Rule3 | $Rule4 | $Rule5 | $Rule6 | $Rule7 | $Rule8 | $Rule9 | $Rule10 | $Rule11 | $Rule12 | $Rule13 | $Rule14);
+
+$WordJoin = [:Line_Break=Word_Joiner:];
+
+$LaoJoinedSyllableEx = $LaoSyllableEx ($WordJoin $LaoSyllableEx)*;
+
+#
+# default numerical definitions
+#
+$Extend       = [\p{Word_Break = Extend}];
+$Format       = [\p{Word_Break = Format}];
+$MidNumLet    = [\p{Word_Break = MidNumLet}];
+$MidNum       = [\p{Word_Break = MidNum}];
+$Numeric      = [\p{Word_Break = Numeric}];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];                                                          
+$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
+$MidNumEx       = $MidNum       ($Extend |  $Format)*;
+$NumericEx      = $Numeric      ($Extend |  $Format)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
+
+!!forward;
+$LaoJoinedSyllableEx {200};
+
+#
+# default numeric rules
+#
+$NumericEx {100};
+$NumericEx $NumericEx {100};
+$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
+$NumericEx      $ExtendNumLetEx {100};  
+$ExtendNumLetEx $NumericEx  {100};
Index: contrib/icu/src/resources/org/apache/lucene/icu/tokenizer/Myanmar.rbbi
===================================================================
--- contrib/icu/src/resources/org/apache/lucene/icu/tokenizer/Myanmar.rbbi	(revision 0)
+++ contrib/icu/src/resources/org/apache/lucene/icu/tokenizer/Myanmar.rbbi	(revision 0)
@@ -0,0 +1,58 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# 
+# Parses Myanmar text, with syllable as token. 
+# ICUBigramFilter can be used to form bigrams of these syllables downstream.
+#
+!!chain;
+#
+
+$Cons = [[:Other_Letter:]&[:Myanmar:]];
+$Virama = [\u1039];
+$Asat = [\u103A];
+
+$WordJoin = [:Line_Break=Word_Joiner:]; 
+#
+# default numerical definitions
+#
+$Extend       = [\p{Word_Break = Extend}];
+$Format       = [\p{Word_Break = Format}];
+$MidNumLet    = [\p{Word_Break = MidNumLet}];
+$MidNum       = [\p{Word_Break = MidNum}];
+$Numeric      = [\p{Word_Break = Numeric}];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];                                                          
+$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
+$MidNumEx       = $MidNum       ($Extend |  $Format)*;
+$NumericEx      = $Numeric      ($Extend |  $Format)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
+
+$ConsEx = $Cons ($Extend | $Format)*;
+$AsatEx = $Cons $Asat ($Virama $ConsEx)? ($Extend | $Format)*;
+$MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)? ($AsatEx)*;
+$MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*;
+
+!!forward;
+$MyanmarJoinedSyllableEx {200};
+
+#
+# default numeric rules
+#
+$NumericEx {100};
+$NumericEx $NumericEx {100};
+$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
+$NumericEx      $ExtendNumLetEx {100};  
+$ExtendNumLetEx $NumericEx  {100};
Index: contrib/icu/src/test/org/apache/lucene/icu/TestICUAnalyzer.java
===================================================================
--- contrib/icu/src/test/org/apache/lucene/icu/TestICUAnalyzer.java	(revision 0)
+++ contrib/icu/src/test/org/apache/lucene/icu/TestICUAnalyzer.java	(revision 0)
@@ -0,0 +1,208 @@
+package org.apache.lucene.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+
+public class TestICUAnalyzer extends BaseTokenStreamTestCase {
+  private Analyzer a = new ICUAnalyzer();
+  
+  /* simple tests using samples from a few different writing systems and languages
+   * TODO: more tests
+   */
+  
+  public void testArmenian() throws Exception {
+    assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
+        new String[] { "վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից", 
+        "ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "վիքիպեդիայի", "կայքը" } );
+  }
+  
+  public void testAmharic() throws Exception {
+    assertAnalyzesTo(a, "ዊኪፔድያ የባለ ብዙ ቋንቋ የተሟላ ትክክለኛና ነጻ መዝገበ ዕውቀት (ኢንሳይክሎፒዲያ) ነው። ማንኛውም",
+        new String[] { "ዊኪፔድያ", "የባለ", "ብዙ", "ቋንቋ", "የተሟላ", "ትክክለኛና", "ነጻ", "መዝገበ", "ዕውቀት", "ኢንሳይክሎፒዲያ", "ነው", "ማንኛውም" } );
+  }
+  
+  public void testArabic() throws Exception {
+    assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.",
+        new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
+        "بالإنجليزية", "truth", "in", "numbers", "the", "wikipedia", "story", "سيتم", "إطلاقه", "في", "2008" } ); 
+  }
+  
+  public void testAramaic() throws Exception {
+    assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀",
+        new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
+        "ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"});
+  }
+  
+  public void testBengali() throws Exception {
+    assertAnalyzesTo(a, "এই বিশ্বকোষ পরিচালনা করে উইকিমিডিয়া ফাউন্ডেশন (একটি অলাভজনক সংস্থা)। উইকিপিডিয়ার শুরু ১৫ জানুয়ারি, ২০০১ সালে। এখন পর্যন্ত ২০০টিরও বেশী ভাষায় উইকিপিডিয়া রয়েছে।",
+        new String[] { "এই", "বিশ্বকোষ", "পরিচালনা", "করে", "উইকিমিডিয়া", "ফাউন্ডেশন", "একটি", "অলাভজনক", "সংস্থা", "উইকিপিডিয়ার",
+        "শুরু", "15", "জানুয়ারি", "2001", "সালে", "এখন", "পর্যন্ত", "200টিরও", "বেশী", "ভাষায়", "উইকিপিডিয়া", "রয়েছে" });
+  }
+  
+  public void testFarsi() throws Exception {
+    assertAnalyzesTo(a, "ویکی پدیای انگلیسی در تاریخ ۲۵ دی ۱۳۷۹ به صورت مکملی برای دانشنامهٔ تخصصی نوپدیا نوشته شد.",
+        new String[] { "ویکی", "پدیای", "انگلیسی", "در", "تاریخ", "25", "دی", "1379", "به", "صورت", "مکملی",
+        "برای", "دانشنامهٔ", "تخصصی", "نوپدیا", "نوشته", "شد" });
+  }
+  
+  public void testGreek() throws Exception {
+    assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.",
+        new String[] { "γράφεται", "σε", "συνεργασία", "από", "εθελοντέσ", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
+        "σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
+  }
+  
+  public void testLao() throws Exception {
+    assertAnalyzesTo(a, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ", "ກວ່າດອກ" });
+  }
+  
+  public void testThai() throws Exception {
+    assertAnalyzesTo(a, "การที่ได้ต้องแสดงว่างานดี. แล้วเธอจะไปไหน? ๑๒๓๔",
+        new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "แล้ว", "เธอ", "จะ", "ไป", "ไหน", "1234"});
+  }
+  
+  public void testTibetan() throws Exception {
+    assertAnalyzesTo(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །",
+        new String[] { "སྣོན", "མཛོད", "དང", "ལས", "འདིས", "བོད", "ཡིག", "མི", "ཉམས", "གོང", "འཕེལ", "དུ", "གཏོང", "བར", "ཧ", "ཅང", "དགེ", "མཚན", "མཆིས", "སོ" });
+  }
+  
+  /*
+   * For chinese, use bigrams
+   * TODO: why do full-width numerics have no word-break prop?
+   */
+  public void testChinese() throws Exception {
+    assertAnalyzesTo(a, "我是中国人。 １２３４ Ｔｅｓｔｓ ",
+        new String[] { "我", "是", "我是", "中", "是中", "国", "中国", "人", "国人", "tests"});
+  }
+  
+  public void testEmpty() throws Exception {
+    assertAnalyzesTo(a, "", new String[] {});
+    assertAnalyzesTo(a, ".", new String[] {});
+    assertAnalyzesTo(a, " ", new String[] {});
+  }
+  
+  /* test various jira issues this analyzer is related to */
+  
+  public void testLUCENE1032() throws Exception {
+    /*
+     * Some of our Japanese customers are reporting errors when performing searches using half width characters.
+     * The desired behavior is that a document containing half width characters should be returned when performing
+     * a search using full width equivalents or when searching by the half width character itself.
+     * Currently, a search will not return any matches for half width characters. 
+     *
+     * ICUAnalyzer normalizes to NFKC by default.
+     */
+    byte[] fullWidthKa = new byte[]{(byte) 0xE3, (byte) 0x82, (byte) 0xAB};
+    byte[] halfWidthKa = new byte[]{(byte) 0xEF, (byte) 0xBD, (byte) 0xB6};
+
+    assertAnalyzesTo(a, new String(halfWidthKa, "UTF-8"), new String[] { new String(fullWidthKa, "UTF-8") });
+    assertAnalyzesTo(a, new String(fullWidthKa, "UTF-8"), new String[] { new String(fullWidthKa, "UTF-8") });
+  }
+  
+  public void testLUCENE1215() throws Exception {
+    /*
+     * New in java 6, we have java.text.Normalizer that supports Unicode Standard Annex #15 normalization.
+     * 
+     * FYI: The java6 impl is String-only, and does not provide quickCheck, although isNormalized MIGHT invoke quickCheck.
+     * 
+     * ICUAnalyzer normalizes to NFKC by default (though this can be changed).
+     * See tests for ICUNormalizationFilter.
+     */
+
+    assertAnalyzesTo(a, "ﴳﴺﰧ", new String[] { "طمطمطم" }); // arabic presentation forms
+  }
+  
+  public void testLUCENE1545() throws Exception {
+    /*
+     * Standard analyzer does not correctly tokenize combining character U+0364 COMBINING LATIN SMALL LETTER E.
+     * The word "moͤchte" is incorrectly tokenized into "mo" "chte", the combining character is lost.
+     * Expected result is only one token "moͤchte".
+     * 
+     * ICUAnalyzer implements Unicode Text Segmentation, which never separates a combining mark from its base character.
+     */
+    assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" }); 
+  }
+  
+ // public void testLUCENE1161() throws Exception {
+    /*
+     * It would be useful, in the StandardTokenizer, to be able to have more control over how in-word punctuation is handled.
+     * For instance, it is not always desirable to split on dashes or other punctuation.
+     * 
+     * Figure out nice example of this with a tailored ruleset.
+     * It's possible to even do something such as provide DBBI for hyphenation, etc etc.
+     */
+ // }
+  
+  /* Tests from StandardAnalyzer, just to show behavior is similar */
+  public void testAlphanumericSA() throws Exception {
+    // alphanumeric tokens
+    assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
+    assertAnalyzesTo(a, "2B", new String[]{"2b"});
+  }
+
+  public void testDelimitersSA() throws Exception {
+    // other delimiters: "-", "/", ","
+    assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
+    assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
+    assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
+  }
+
+  public void testApostrophesSA() throws Exception {
+    // internal apostrophes: O'Reilly, you're, O'Reilly's
+    assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
+    assertAnalyzesTo(a, "you're", new String[]{"you're"});
+    assertAnalyzesTo(a, "she's", new String[]{"she's"});
+    assertAnalyzesTo(a, "Jim's", new String[]{"jim's"});
+    assertAnalyzesTo(a, "don't", new String[]{"don't"});
+    assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly's"});
+  }
+
+  public void testNumericSA() throws Exception {
+    // floating point, serial, model numbers, ip addresses, etc.
+    // every other segment must have at least one digit
+    assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
+    assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
+    assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
+    assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
+    assertAnalyzesTo(a, "২০৬৭০৩", new String[]{"206703"});
+  }
+
+  public void testTextWithNumbersSA() throws Exception {
+    // numbers
+    assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
+  }
+
+  public void testVariousTextSA() throws Exception {
+    // various
+    assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"});
+    assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
+    assertAnalyzesTo(a, "foo      bar .  FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
+    assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
+  }
+
+  public void testKoreanSA() throws Exception {
+    // Korean words
+    assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
+  }
+  
+  public void testReusableTokenStream() throws Exception {
+    assertAnalyzesToReuse(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །",
+        new String[] { "སྣོན", "མཛོད", "དང", "ལས", "འདིས", "བོད", "ཡིག", "མི", "ཉམས", "གོང", "འཕེལ", "དུ", "གཏོང", "བར", "ཧ", "ཅང", "དགེ", "མཚན", "མཆིས", "སོ" });
+  }
+}

Property changes on: contrib\icu\src\test\org\apache\lucene\icu\TestICUAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/test/org/apache/lucene/icu/TestICUBigramFilter.java
===================================================================
--- contrib/icu/src/test/org/apache/lucene/icu/TestICUBigramFilter.java	(revision 0)
+++ contrib/icu/src/test/org/apache/lucene/icu/TestICUBigramFilter.java	(revision 0)
@@ -0,0 +1,50 @@
+package org.apache.lucene.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.icu.tokenizer.ICUTokenizer;
+
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.Normalizer;
+
+public class TestICUBigramFilter extends BaseTokenStreamTestCase {
+  
+  public void testBasicFunctionality() throws Exception {
+    TokenStream ts = new ICUBigramFilter(new ICUTokenizer(new StringReader("我购买了")), 
+        Normalizer.NFC, new int [] { UScript.HAN });
+    assertTokenStreamContents(ts, new String[] { "我", "购", "我购", "买", "购买", "了", "买了" });
+  }
+  
+  
+  public void testMixedScripts() throws Exception {
+    TokenStream ts = new ICUBigramFilter(new ICUTokenizer(new StringReader("test我购买了abcd")), 
+        Normalizer.NFC, new int [] { UScript.HAN });
+    assertTokenStreamContents(ts, new String[] { "test", "我", "购", "我购", "买", "购买", "了", "买了", "abcd" });
+  }
+  
+  public void testMixedScripts2() throws Exception {
+    TokenStream ts = new ICUBigramFilter(new ICUTokenizer(new StringReader("test我购买了ချောတဲ့လူabcd")),
+      Normalizer.NFC, new int [] { UScript.HAN, UScript.MYANMAR });
+    assertTokenStreamContents(ts, new String[] { "test", "我", "购", "我购", "买", "购买", "了", "买了",
+        "ချော", "တဲ့", "ချောတဲ့", "လူ", "တဲ့လူ", "abcd" });
+  }
+}

Property changes on: contrib\icu\src\test\org\apache\lucene\icu\TestICUBigramFilter.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/test/org/apache/lucene/icu/TestICUCaseFoldingFilter.java
===================================================================
--- contrib/icu/src/test/org/apache/lucene/icu/TestICUCaseFoldingFilter.java	(revision 0)
+++ contrib/icu/src/test/org/apache/lucene/icu/TestICUCaseFoldingFilter.java	(revision 0)
@@ -0,0 +1,196 @@
+package org.apache.lucene.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.KeywordTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.Normalizer;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.text.UnicodeSetIterator;
+import com.ibm.icu.text.Normalizer.Mode;
+
+public class TestICUCaseFoldingFilter extends BaseTokenStreamTestCase {
+  private Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
+  
+  private TokenFilter nfkcFold = new ICUCaseFoldingFilter(tokenizer, Normalizer.NFKC);
+  private TokenFilter nfcFold = new ICUCaseFoldingFilter(tokenizer, Normalizer.NFC);
+  private TokenFilter nfkdFold = new ICUCaseFoldingFilter(tokenizer, Normalizer.NFKD);
+  private TokenFilter nfdFold = new ICUCaseFoldingFilter(tokenizer, Normalizer.NFD);
+  private TokenFilter noneFold = new ICUCaseFoldingFilter(tokenizer, Normalizer.NONE);
+  
+  private TokenFilter nfkcNorm = new ICUNormalizationFilter(nfkcFold, Normalizer.NFKC);
+  private TokenFilter nfcNorm = new ICUNormalizationFilter(nfcFold, Normalizer.NFC);
+  private TokenFilter nfkdNorm = new ICUNormalizationFilter(nfkdFold, Normalizer.NFKD);
+  private TokenFilter nfdNorm = new ICUNormalizationFilter(nfdFold, Normalizer.NFD);
+  private TokenFilter noneNorm = new ICUNormalizationFilter(noneFold, Normalizer.NONE);
+
+  /**
+   * Some basic case-folding, including a multi-codepoint folding and a supplementary codepoint folding.
+   */
+  public void testBasicFunctionality() throws Exception {
+    checkToken(Normalizer.NONE, "LuCeNE", "lucene"); // latin
+    checkToken(Normalizer.NONE, "Ruß", "russ"); // german
+    checkToken(Normalizer.NONE, "𐐖", "𐐾"); // suppl. codepoint
+  }
+
+  /**
+   * Validate the case-folder folds all unicode codepoints the same way as the UCharacter String-based method.
+   */
+  public void testUnicodeSet() throws Exception {
+    UnicodeSetIterator codepoints = new UnicodeSetIterator(new UnicodeSet(UnicodeSet.MIN_VALUE, UnicodeSet.MAX_VALUE));
+    while (codepoints.next()) {
+      String codepoint = codepoints.getString();
+      checkToken(Normalizer.NONE, codepoint, icuFold(codepoint));
+    }
+  }
+
+  /**
+   * Case-fold the entire set of unicode codepoints as one huge term, and validate it against the UCharacter String-based method.
+   */
+  public void testUnicodeSetOneTerm() throws Exception {
+    String allUnicode = unicodeSetString();
+    checkToken(Normalizer.NONE, allUnicode, icuFold(allUnicode));
+  }
+
+
+  /**
+   * Validate closure under normalization mode NFC. This is the default for the case-folding algorithm, but it should still work!
+   */
+  public void testUnicodeClosureNFC() throws Exception {
+    UnicodeSetIterator codepoints = new UnicodeSetIterator(new UnicodeSet(UnicodeSet.MIN_VALUE, UnicodeSet.MAX_VALUE));
+    while (codepoints.next()) {
+      String codepoint = codepoints.getString();
+      checkTokenClosure(Normalizer.NFC, codepoint, icuNFC(icuFold(icuNFC(icuFold(codepoint)))));
+    }
+  }
+  
+  /**
+   * Validate closure under normalization mode NFD. This is the default for the case-folding algorithm, but it should still work!
+   */
+  public void testUnicodeClosureNFD() throws Exception {
+    UnicodeSetIterator codepoints = new UnicodeSetIterator(new UnicodeSet(UnicodeSet.MIN_VALUE, UnicodeSet.MAX_VALUE));
+    while (codepoints.next()) {
+      String codepoint = codepoints.getString();
+      checkTokenClosure(Normalizer.NFD, codepoint, icuNFD(icuFold(icuNFD(icuFold(codepoint)))));
+    }
+  }
+
+  /**
+   * Validate closure under normalization mode NFKC. 
+   * In this case the filter will apply NFKC_Closure set to prevent from having to normalize, fold, normalize, fold.
+   */
+  public void testUnicodeClosureNFKC() throws Exception {
+    UnicodeSetIterator codepoints = new UnicodeSetIterator(new UnicodeSet(UnicodeSet.MIN_VALUE, UnicodeSet.MAX_VALUE));
+    while (codepoints.next()) {
+      String codepoint = codepoints.getString();
+      checkTokenClosure(Normalizer.NFKC, codepoint, icuNFKC(icuFold(icuNFKC(icuFold(codepoint)))));
+    }
+  }
+
+  /**
+   * Validate closure under normalization mode NFKD. 
+   * In this case the filter will apply NFKC_Closure set to prevent from having to normalize, fold, normalize, fold.
+   */
+  public void testUnicodeClosureNFKD() throws Exception {
+    UnicodeSetIterator codepoints = new UnicodeSetIterator(new UnicodeSet(UnicodeSet.MIN_VALUE, UnicodeSet.MAX_VALUE));
+    while (codepoints.next()) {
+      String codepoint = codepoints.getString();
+      checkTokenClosure(Normalizer.NFKD, codepoint, icuNFKD(icuFold(icuNFKD(icuFold(codepoint)))));
+    }
+  }
+
+  String unicodeSetString() {
+    StringBuilder everything = new StringBuilder(); // concatenation of every string the iterator yields, in order
+    UnicodeSetIterator codepoints = new UnicodeSetIterator(new UnicodeSet(UnicodeSet.MIN_VALUE, UnicodeSet.MAX_VALUE));
+    while (codepoints.next())
+      everything.append(codepoints.getString());
+    return everything.toString();
+  }
+
+  String icuFold(String s) {
+    return UCharacter.foldCase(s, true);
+  }
+
+  String icuNFKD(String s) {
+    return Normalizer.normalize(s, Normalizer.NFKD);
+  }
+
+  String icuNFD(String s) {
+    return Normalizer.normalize(s, Normalizer.NFD);
+  }
+
+  String icuNFC(String s) {
+    return Normalizer.normalize(s, Normalizer.NFC);
+  }
+
+  String icuNFKC(String s) {
+    return Normalizer.normalize(s, Normalizer.NFKC);
+  }
+  
+  /**
+   * Look up the shared case-folding filter that this class built for the given normalization mode.
+   * Modes are matched by identity against the Normalizer constants.
+   */
+  TokenFilter getFilterForMode(Mode mode) {
+    if (mode == Normalizer.NFC) return nfcFold;
+    if (mode == Normalizer.NFD) return nfdFold;
+    if (mode == Normalizer.NFKD) return nfkdFold;
+    if (mode == Normalizer.NFKC) return nfkcFold;
+    if (mode == Normalizer.NONE) return noneFold;
+
+    // no filter was built for any other mode
+    throw new UnsupportedOperationException("this test class does not support testing this mode!");
+  }
+  
+  /**
+   * Look up the shared fold-then-normalize chain (a normalization filter wrapping the case-folding
+   * filter of the same mode) for the given mode. Modes are matched by identity.
+   */
+  TokenFilter getFilterForModeClosure(Mode mode) {
+    if (mode == Normalizer.NFC) return nfcNorm;
+    if (mode == Normalizer.NFD) return nfdNorm;
+    if (mode == Normalizer.NFKD) return nfkdNorm;
+    if (mode == Normalizer.NFKC) return nfkcNorm;
+    if (mode == Normalizer.NONE) return noneNorm;
+
+    // no filter chain was built for any other mode
+    throw new UnsupportedOperationException("this test class does not support testing this mode!");
+  }
+
+  void checkToken(Mode mode, String input, String expected) throws IOException {
+    tokenizer.reset(new StringReader(input));
+    TokenStream filter = getFilterForMode(mode);
+    filter.reset();
+    assertTokenStreamContents(filter, new String[] { expected });
+  }
+
+  void checkTokenClosure(Mode mode, String input, String expected) throws IOException {
+    tokenizer.reset(new StringReader(input));
+    TokenStream filter = getFilterForModeClosure(mode);
+    filter.reset();
+    assertTokenStreamContents(filter, new String[] { expected });
+  }
+}

Property changes on: contrib\icu\src\test\org\apache\lucene\icu\TestICUCaseFoldingFilter.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/test/org/apache/lucene/icu/TestICUFormatFilter.java
===================================================================
--- contrib/icu/src/test/org/apache/lucene/icu/TestICUFormatFilter.java	(revision 0)
+++ contrib/icu/src/test/org/apache/lucene/icu/TestICUFormatFilter.java	(revision 0)
@@ -0,0 +1,31 @@
+package org.apache.lucene.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.KeywordTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+
+public class TestICUFormatFilter extends BaseTokenStreamTestCase {
+  public void testBasicFunctionality() throws Exception {
+    TokenStream ts = new ICUFormatFilter(new KeywordTokenizer(new StringReader("क्‍ष")));
+    assertTokenStreamContents(ts, new String[] { "क्ष" });
+  }
+}

Property changes on: contrib\icu\src\test\org\apache\lucene\icu\TestICUFormatFilter.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/test/org/apache/lucene/icu/TestICUNormalizationFilter.java
===================================================================
--- contrib/icu/src/test/org/apache/lucene/icu/TestICUNormalizationFilter.java	(revision 0)
+++ contrib/icu/src/test/org/apache/lucene/icu/TestICUNormalizationFilter.java	(revision 0)
@@ -0,0 +1,109 @@
+package org.apache.lucene.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.KeywordTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+
+import com.ibm.icu.text.Normalizer;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.text.UnicodeSetIterator;
+import com.ibm.icu.text.Normalizer.Mode;
+
+
+/**
+ * TODO: maybe just use the Unicode normalization test suite?
+ *
+ */
+public class TestICUNormalizationFilter extends BaseTokenStreamTestCase {
+  private Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
+  
+  private TokenFilter nfkcNorm = new ICUNormalizationFilter(tokenizer, Normalizer.NFKC);
+  private TokenFilter nfcNorm = new ICUNormalizationFilter(tokenizer, Normalizer.NFC);
+  private TokenFilter nfkdNorm = new ICUNormalizationFilter(tokenizer, Normalizer.NFKD);
+  private TokenFilter nfdNorm = new ICUNormalizationFilter(tokenizer, Normalizer.NFD);
+  private TokenFilter noneNorm = new ICUNormalizationFilter(tokenizer, Normalizer.NONE);
+  
+  /**
+   * Look up the shared normalization filter that this class built for the given mode.
+   * Modes are matched by identity against the Normalizer constants.
+   */
+  TokenFilter getFilterForMode(Mode mode) {
+    if (mode == Normalizer.NFC) return nfcNorm;
+    if (mode == Normalizer.NFD) return nfdNorm;
+    if (mode == Normalizer.NFKD) return nfkdNorm;
+    if (mode == Normalizer.NFKC) return nfkcNorm;
+    if (mode == Normalizer.NONE) return noneNorm;
+
+    // no filter was built for any other mode
+    throw new UnsupportedOperationException("this test class does not support testing this mode!");
+  }
+  
+  /**
+   * Validate the given String normalizes correctly in the provided mode against the ICU normalizer.
+   */
+  void check(String input, Normalizer.Mode mode) throws Exception {
+    String expected = Normalizer.normalize(input, mode);
+    tokenizer.reset(new StringReader(input)); // re-point the shared tokenizer at the new text
+    TokenStream filter = getFilterForMode(mode);
+    filter.reset(); // filter is reused across inputs; same reset-before-consume order as TestICUCaseFoldingFilter.checkToken
+    assertTokenStreamContents(filter, new String[] { expected });
+  }
+
+  /**
+   * Validate all unicode codepoints against the given mode.
+   */
+  void unicodeCompliance(Normalizer.Mode mode) throws Exception {
+    UnicodeSetIterator codepoints = new UnicodeSetIterator(new UnicodeSet(UnicodeSet.MIN_VALUE, UnicodeSet.MAX_VALUE));
+    while (codepoints.next())
+      check(codepoints.getString(), mode);
+  }
+
+  public void testComplianceNFC() throws Exception {
+    unicodeCompliance(Normalizer.NFC);
+  }
+
+  public void testComplianceNFKC() throws Exception {
+    unicodeCompliance(Normalizer.NFKC);
+  }
+
+  public void testComplianceNFD() throws Exception {
+    unicodeCompliance(Normalizer.NFD);
+  }
+
+  public void testComplianceNFKD() throws Exception {
+    unicodeCompliance(Normalizer.NFKD);
+  }
+  
+  public void testComplianceNone() throws Exception {
+    unicodeCompliance(Normalizer.NONE);
+  }
+  
+  public void testHugeExpansion() throws Exception {
+    // one term made of 1000 copies of U+FDFA, checked against the ICU normalizer in NFKC mode
+    char repeated[] = new char[1000];
+    java.util.Arrays.fill(repeated, '\uFDFA');
+    String hugeString = new String(repeated);
+    check(hugeString, Normalizer.NFKC);
+  }
+}

Property changes on: contrib\icu\src\test\org\apache\lucene\icu\TestICUNormalizationFilter.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/test/org/apache/lucene/icu/TestICUTransformFilter.java
===================================================================
--- contrib/icu/src/test/org/apache/lucene/icu/TestICUTransformFilter.java	(revision 0)
+++ contrib/icu/src/test/org/apache/lucene/icu/TestICUTransformFilter.java	(revision 0)
@@ -0,0 +1,92 @@
+package org.apache.lucene.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.KeywordTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.icu.tokenizer.ICUTokenizer;
+
+import com.ibm.icu.text.Transliterator;
+import com.ibm.icu.text.UnicodeSet;
+
+
+/**
+ * Test the ICUTransformFilter with some basic examples.
+ */
+public class TestICUTransformFilter extends BaseTokenStreamTestCase {
+  
+  public void testBasicFunctionality() throws Exception {
+    checkToken(Transliterator.getInstance("Traditional-Simplified"), "簡化字", "简化字"); 
+    checkToken(Transliterator.getInstance("Katakana-Hiragana"), "ヒラガナ", "ひらがな");
+    checkToken(Transliterator.getInstance("Fullwidth-Halfwidth"), "アルアノリウ", "ｱﾙｱﾉﾘｳ");
+    checkToken(Transliterator.getInstance("Any-Latin"), "Αλφαβητικός Κατάλογος", "Alphabētikós Katálogos");
+    checkToken(Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove"), "Alphabētikós Katálogos", "Alphabetikos Katalogos");
+  }
+  
+  public void testCustomFunctionality() throws Exception {
+    String rules = "a > b; b > c;"; // convert a's to b's and b's to c's
+    checkToken(Transliterator.createFromRules("test", rules, Transliterator.FORWARD), "abacadaba", "bcbcbdbcb");
+  }
+  
+  public void testOptimizer() throws Exception {
+    String rules = "a > b; b > c;"; // convert a's to b's and b's to c's
+    Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
+    assertNull(custom.getFilter()); // precondition: no filter before the transform filter is built
+    new ICUTransformFilter(new KeywordTokenizer(new StringReader("")), custom); // built only for its effect on custom
+    assertEquals(new UnicodeSet("[ab]"), custom.getFilter()); // fails with both values shown, not an NPE, if unset
+  }
+  
+  public void testOptimizerSurrogate() throws Exception {
+    String rules = "\\U00020087 > x;"; // convert CJK UNIFIED IDEOGRAPH-20087 to an x
+    Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
+    assertNull(custom.getFilter()); // precondition: no filter before the transform filter is built
+    new ICUTransformFilter(new KeywordTokenizer(new StringReader("")), custom); // built only for its effect on custom
+    assertEquals(new UnicodeSet("[\\U00020087]"), custom.getFilter()); // fails with both values shown, not an NPE, if unset
+  }
+  
+  public void testLUCENE1343() throws Exception {
+    /*
+     * A replacement for ISOLatin1AccentFilter that does a more thorough job of removing diacritical marks or non-spacing modifiers.
+     * This issue also had a normalization impl, but no quickCheck.
+     * 
+     * ICUAnalyzer does NOT remove any accents by default!!!!
+     * This is language-specific/usually incorrect behavior, but if you want to do this kind of thing, the components are here.
+     * 
+     * If you want to do some additional custom mappings, you can create a custom Transliterator.
+     */
+    Analyzer a = new Analyzer() {
+      public TokenStream tokenStream(String fieldName, Reader reader) {
+        return new ICUTransformFilter(
+            new ICUTokenizer(reader), Transliterator.getInstance("NFKD; [:Nonspacing Mark:] Remove; NFC"));
+      } };
+    
+    assertAnalyzesTo(a, "Sorcie\u0300re sorcière Pe\u0301rez Matilde Pérez A\u0308\uFB03ne Äffine ",
+                        new String[] { "Sorciere","sorciere","Perez", "Matilde", "Perez", "Affine", "Affine" });
+  }
+
+  private void checkToken(Transliterator transform, String input, String expected) throws IOException {
+    KeywordTokenizer wholeInput = new KeywordTokenizer(new StringReader(input)); // wrapped by the filter under test
+    assertTokenStreamContents(new ICUTransformFilter(wholeInput, transform), new String[] { expected });
+  }
+}

Property changes on: contrib\icu\src\test\org\apache\lucene\icu\TestICUTransformFilter.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/test/org/apache/lucene/icu/tokenizer/TestCharArrayIterator.java
===================================================================
--- contrib/icu/src/test/org/apache/lucene/icu/tokenizer/TestCharArrayIterator.java	(revision 0)
+++ contrib/icu/src/test/org/apache/lucene/icu/tokenizer/TestCharArrayIterator.java	(revision 0)
@@ -0,0 +1,108 @@
+package org.apache.lucene.icu.tokenizer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.CharacterIterator;
+
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestCharArrayIterator extends LuceneTestCase {
+  public void testBasicUsage() {
+    CharArrayIterator ci = new CharArrayIterator();
+    ci.setText("testing".toCharArray(), 0, "testing".length());
+    assertEquals(0, ci.getBeginIndex());
+    assertEquals(7, ci.getEndIndex());
+    assertEquals(0, ci.getIndex());
+    assertEquals('t', ci.current());
+    assertEquals('e', ci.next());
+    assertEquals('g', ci.last());
+    assertEquals('n', ci.previous());
+    assertEquals('t', ci.first());
+    assertEquals(CharacterIterator.DONE, ci.previous());
+  }
+  
+  public void testFirst() {
+    CharArrayIterator ci = new CharArrayIterator();
+    ci.setText("testing".toCharArray(), 0, "testing".length());
+    ci.next();
+    // Sets the position to getBeginIndex() and returns the character at that position. 
+    assertEquals('t', ci.first());
+    assertEquals(ci.getBeginIndex(), ci.getIndex());
+    // or DONE if the text is empty
+    ci.setText(new char[] {}, 0, 0);
+    assertEquals(CharacterIterator.DONE, ci.first());
+  }
+  
+  public void testLast() {
+    CharArrayIterator ci = new CharArrayIterator();
+    ci.setText("testing".toCharArray(), 0, "testing".length());
+    // Sets the position to getEndIndex()-1 (getEndIndex() if the text is empty) 
+    // and returns the character at that position. 
+    assertEquals('g', ci.last());
+    assertEquals(ci.getIndex(), ci.getEndIndex() - 1);
+    // or DONE if the text is empty
+    ci.setText(new char[] {}, 0, 0);
+    assertEquals(CharacterIterator.DONE, ci.last());
+    assertEquals(ci.getEndIndex(), ci.getIndex());
+  }
+  
+  public void testCurrent() {
+    CharArrayIterator ci = new CharArrayIterator();
+    // Gets the character at the current position (as returned by getIndex()). 
+    ci.setText("testing".toCharArray(), 0, "testing".length());
+    assertEquals('t', ci.current());
+    ci.last();
+    ci.next();
+    // or DONE if the current position is off the end of the text.
+    assertEquals(CharacterIterator.DONE, ci.current());
+  }
+  
+  public void testNext() {
+    CharArrayIterator ci = new CharArrayIterator();
+    ci.setText("te".toCharArray(), 0, 2);
+    // Increments the iterator's index by one and returns the character at the new index.
+    assertEquals('e', ci.next());
+    assertEquals(1, ci.getIndex());
+    // or DONE if the new position is off the end of the text range.
+    assertEquals(CharacterIterator.DONE, ci.next());
+    assertEquals(ci.getEndIndex(), ci.getIndex());
+  }
+  
+  public void testSetIndex() {
+    CharArrayIterator ci = new CharArrayIterator();
+    ci.setText("test".toCharArray(), 0, "test".length());
+    try {
+      ci.setIndex(5); // the text covers only 4 chars, so 5 is outside the valid range
+      fail("setIndex(5) on 4 chars of text should throw IllegalArgumentException");
+    } catch (IllegalArgumentException expected) {
+      // expected; any other exception type propagates and is reported with its own stack trace
+    }
+  }
+  
+  public void testClone() {
+    char text[] = "testing".toCharArray();
+    CharArrayIterator ci = new CharArrayIterator();
+    ci.setText(text, 0, text.length); // give the iterator real text so the clone comparisons are meaningful
+    ci.next();
+    CharArrayIterator ci2 = (CharArrayIterator) ci.clone();
+    assertEquals(ci.getIndex(), ci2.getIndex());
+    assertEquals(ci.next(), ci2.next());
+    assertEquals(ci.last(), ci2.last());
+  }
+
+}

Property changes on: contrib\icu\src\test\org\apache\lucene\icu\tokenizer\TestCharArrayIterator.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/test/org/apache/lucene/icu/tokenizer/TestICUTokenizer.java
===================================================================
--- contrib/icu/src/test/org/apache/lucene/icu/tokenizer/TestICUTokenizer.java	(revision 0)
+++ contrib/icu/src/test/org/apache/lucene/icu/tokenizer/TestICUTokenizer.java	(revision 0)
@@ -0,0 +1,58 @@
+package org.apache.lucene.icu.tokenizer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+
+import java.util.Arrays;
+
+public class TestICUTokenizer extends BaseTokenStreamTestCase {
+  /** 4094 spaces followed by two words: only the two words are expected as tokens. */
+  public void testHugeDoc() throws IOException {
+    char padding[] = new char[4094];
+    Arrays.fill(padding, ' ');
+    StringBuilder doc = new StringBuilder();
+    doc.append(padding);
+    doc.append("testing 1234");
+    ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(doc.toString()));
+    String expected[] = { "testing", "1234" };
+    assertTokenStreamContents(tokenizer, expected);
+  }
+  /** One unbroken run of 40960 'a' chars is expected to come back as ten 4096-char tokens. */
+  public void testHugeTerm2() throws IOException {
+    // Input: 40960 copies of 'a' and nothing else.
+    char run[] = new char[40960];
+    Arrays.fill(run, 'a');
+    String input = new String(run);
+    ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
+
+    // Expected output: the same 4096-char token, ten times over (10 * 4096 == 40960).
+    char token[] = new char[4096];
+    Arrays.fill(token, 'a');
+    String expectedToken = new String(token);
+    String expected[] = new String[10];
+    for (int i = 0; i < expected.length; i++) {
+      expected[i] = expectedToken;
+    }
+
+    assertTokenStreamContents(tokenizer, expected);
+  }
+}

Property changes on: contrib\icu\src\test\org\apache\lucene\icu\tokenizer\TestICUTokenizer.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/test/org/apache/lucene/icu/tokenizer/TestLaoBreakIterator.java
===================================================================
--- contrib/icu/src/test/org/apache/lucene/icu/tokenizer/TestLaoBreakIterator.java	(revision 0)
+++ contrib/icu/src/test/org/apache/lucene/icu/tokenizer/TestLaoBreakIterator.java	(revision 0)
@@ -0,0 +1,77 @@
+package org.apache.lucene.icu.tokenizer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.InputStream;
+
+import org.apache.lucene.util.LuceneTestCase;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
+import com.ibm.icu.text.UTF16;
+
+public class TestLaoBreakIterator extends LuceneTestCase {
+  private LaoBreakIterator wordIterator;
+  /** Builds the iterator under test from the compiled rules in the Lao.brk resource. */
+  protected void setUp() throws Exception {
+    super.setUp();
+    InputStream is = TestLaoBreakIterator.class.getResourceAsStream("Lao.brk");
+    wordIterator = new LaoBreakIterator(RuleBasedBreakIterator.getInstanceFromCompiledRules(is));
+    is.close();
+  }
+  /** Asserts that the word segments iterator finds in sourceText are exactly tokens, in order. */
+  protected void assertBreaksTo(BreakIterator iterator, String sourceText, String tokens[]) {
+    char text[] = sourceText.toCharArray();
+    CharArrayIterator ci = new CharArrayIterator();
+    ci.setText(text, 0, text.length);
+    iterator.setText(ci);
+    
+    for (int i = 0; i < tokens.length; i++) {
+      int start, end;
+      do { // advance until a segment containing a letter or digit is found, or the text ends
+        start = iterator.current();
+        end = iterator.next();
+      } while (end != BreakIterator.DONE && !isWord(text, start, end));
+      assertTrue(start != BreakIterator.DONE);
+      assertTrue(end != BreakIterator.DONE);
+      assertEquals(tokens[i], new String(text, start, end - start));
+    }
+    // After the last expected token the iterator must report DONE on the next step.
+    assertTrue(iterator.next() == BreakIterator.DONE);
+  }
+  /** Returns true if any code point in text[start, end) is a letter or digit. */
+  protected boolean isWord(char text[], int start, int end) {
+    int codepoint;
+    for (int i = start; i < end; i += UTF16.getCharCount(codepoint)) {
+      // Read the code point at i (not at start) so every code point of the span is examined.
+      codepoint = UTF16.charAt(text, 0, end, i);
+      if (UCharacter.isLetterOrDigit(codepoint)) {
+        return true;
+      }
+    }
+    return false;
+  }
+  
+  public void testBasicUsage() throws Exception {
+    assertBreaksTo(wordIterator, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
+    assertBreaksTo(wordIterator, "ຜູ້​ເຂົ້າ", new String[] { "ຜູ້", "ເຂົ້າ" });
+    assertBreaksTo(wordIterator, "", new String[] {});
+  }
+ 
+}

Property changes on: contrib\icu\src\test\org\apache\lucene\icu\tokenizer\TestLaoBreakIterator.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/tools/java/com/ibm/icu/text/RBBIRuleBuilderTunnel.java
===================================================================
--- contrib/icu/src/tools/java/com/ibm/icu/text/RBBIRuleBuilderTunnel.java	(revision 0)
+++ contrib/icu/src/tools/java/com/ibm/icu/text/RBBIRuleBuilderTunnel.java	(revision 0)
@@ -0,0 +1,27 @@
+package com.ibm.icu.text;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.OutputStream;
+
+public class RBBIRuleBuilderTunnel { // declared in com.ibm.icu.text, the same package as RBBIRuleBuilder
+  public static void compileRules(String rules, OutputStream os) throws IOException { // public pass-through
+    RBBIRuleBuilder.compileRules(rules, os); // NOTE(review): presumably not callable from other packages - confirm
+  }
+}

Property changes on: contrib\icu\src\tools\java\com\ibm\icu\text\RBBIRuleBuilderTunnel.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/icu/src/tools/java/org/apache/lucene/icu/RBBIRuleCompiler.java
===================================================================
--- contrib/icu/src/tools/java/org/apache/lucene/icu/RBBIRuleCompiler.java	(revision 0)
+++ contrib/icu/src/tools/java/org/apache/lucene/icu/RBBIRuleCompiler.java	(revision 0)
@@ -0,0 +1,89 @@
+package org.apache.lucene.icu;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+
+import com.ibm.icu.text.RBBIRuleBuilderTunnel;
+import com.ibm.icu.text.RuleBasedBreakIterator;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Command-line tool that compiles every <code>.rbbi</code> rule file under the current directory
+ * (searched recursively) into a <code>.brk</code> file written next to the rule file.
+ */
+public class RBBIRuleCompiler {
+  /** Reads ruleFile as UTF-8; lines starting with '#' are emptied but keep their line break. */
+  static String getRules(File ruleFile) throws IOException {
+    StringBuilder rules = new StringBuilder();
+    InputStream in = new FileInputStream(ruleFile);
+    try { // the finally below releases the file even when reading fails part-way
+      BufferedReader cin = new BufferedReader(new InputStreamReader(in, "UTF-8"));
+      for (String line = cin.readLine(); line != null; line = cin.readLine()) {
+        if (!line.startsWith("#")) {
+          rules.append(line);
+        }
+        rules.append('\n'); // appended for skipped lines too, so the line count is unchanged
+      }
+    } finally {
+      in.close();
+    }
+    return rules.toString();
+  }
+  /** Compiles file: a directory is walked recursively, anything else is compiled as a rule file. */
+  static void compile(File file) throws IOException {
+    if (file.isDirectory()) {
+      File files[] = file.listFiles(new FilenameFilter() {
+        public boolean accept(File dir, String name) {
+          return name.endsWith(".rbbi") || new File(dir, name).isDirectory();
+        }});
+      if (files == null) throw new IOException("Cannot list directory " + file); // unreadable dir
+      for (int i = 0; i < files.length; i++)
+        compile(files[i]);
+    } else {
+      File outputFile = new File(file.getParent(), file.getName().replaceAll("rbbi$", "brk"));
+      String rules = getRules(file);
+      System.err.println("Compiling " + file.getName() + " to " + outputFile.getName());
+      // compileRules() may succeed even on a syntax error, so instantiate from the rules first;
+      // on failure print just the message (a readable syntax error, no stack trace) and exit 1.
+      try {
+        new RuleBasedBreakIterator(rules);
+      } catch (IllegalArgumentException e) {
+        System.err.println(e.getMessage());
+        System.exit(1);
+      }
+      FileOutputStream os = new FileOutputStream(outputFile);
+      try {
+        RBBIRuleBuilderTunnel.compileRules(rules, os);
+      } finally {
+        os.close(); // close the output even if rule compilation throws
+      }
+    }
+  }
+  /** Compiles the rule files under the working directory, then exits with status 0. */
+  public static void main(String args[]) throws Exception {
+    compile(new File("."));
+    System.exit(0);
+  }
+}

Property changes on: contrib\icu\src\tools\java\org\apache\lucene\icu\RBBIRuleCompiler.java
___________________________________________________________________
Added: svn:eol-style
   + native

