Patch summary (svn diff against revision 1373984):
  Util.java: adds Util.toUTF16(CharSequence, IntsRef), which copies each
    UTF-16 code unit (char) of the input into the scratch IntsRef.
  NormalizeCharMap.java: builds the FST keys with Util.toUTF16 instead of
    Util.toUTF32 (the FST builder is created with INPUT_TYPE.BYTE2) and
    removes one empty line after the builder.add call.
  TestMappingCharFilter.java: adds a mapping from U+1D122 to "fclef" and a
    testNonBMP case that expects offsets 0..2 for that single code point.

NOTE(review): line breaks, indentation, blank context lines and the column
alignment of the "//" ruler comments were reconstructed from a
whitespace-collapsed copy of this patch; confirm against the repository
before applying.
NOTE(review): generic type arguments (e.g. on Map.Entry and fst.Builder)
look like they were stripped from the context lines; confirm.

Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java (revision 1373984)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java (working copy)
@@ -33,6 +33,7 @@
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util._TestUtil;
 
 public class TestMappingCharFilter extends BaseTokenStreamTestCase {
@@ -55,6 +56,8 @@
 
     builder.add( "empty", "" );
 
+    builder.add(UnicodeUtil.newString(new int[] {0x1D122}, 0, 1), "fclef");
+
     normMap = builder.build();
   }
 
@@ -128,6 +131,12 @@
     assertTokenStreamContents(ts, new String[0], new int[]{}, new int[]{}, 5);
   }
 
+  public void testNonBMP() throws Exception {
+    CharFilter cs = new MappingCharFilter( normMap, new StringReader( UnicodeUtil.newString(new int[] {0x1D122}, 0, 1) ) );
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+    assertTokenStreamContents(ts, new String[]{"fclef"}, new int[]{0}, new int[]{2}, 2);
+  }
+
   //
   //                1111111111222
   //      01234567890123456789012
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java (revision 1373984)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java (working copy)
@@ -111,9 +111,8 @@
       final org.apache.lucene.util.fst.Builder builder = new org.apache.lucene.util.fst.Builder(FST.INPUT_TYPE.BYTE2, outputs);
       final IntsRef scratch = new IntsRef();
       for(Map.Entry ent : pendingPairs.entrySet()) {
-        builder.add(Util.toUTF32(ent.getKey(), scratch),
+        builder.add(Util.toUTF16(ent.getKey(), scratch),
                     new CharsRef(ent.getValue()));
-
       }
       map = builder.finish();
       pendingPairs.clear();
Index: lucene/core/src/java/org/apache/lucene/util/fst/Util.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/fst/Util.java (revision 1373984)
+++ lucene/core/src/java/org/apache/lucene/util/fst/Util.java (working copy)
@@ -767,6 +767,21 @@
     }
   }
 
+  /** Just maps each UTF16 unit (char) to the ints in an
+   *  IntsRef. */
+  public static IntsRef toUTF16(CharSequence s, IntsRef scratch) {
+    final int charLimit = s.length();
+    scratch.grow(charLimit);
+    int idx = 0;
+    while(idx < charLimit) {
+      scratch.ints[idx] = (int) s.charAt(idx);
+      idx++;
+    }
+    scratch.offset = 0;
+    scratch.length = idx;
+    return scratch;
+  }
+
   /** Decodes the Unicode codepoints from the provided
    *  CharSequence and places them in the provided scratch
    *  IntsRef, which must not be null, returning it. */