Index: CHANGES.txt
===================================================================
--- CHANGES.txt (revision 829889)
+++ CHANGES.txt (working copy)
@@ -36,6 +36,10 @@
  * LUCENE-2004: Fix Constants.LUCENE_MAIN_VERSION to not be inlined
    by client code. (Uwe Schindler)
 
+ * LUCENE-2016: Replace illegal U+FFFF character with the replacement
+   char (U+FFFD) during indexing, to prevent silent index corruption.
+   (Peter Keegan, Mike McCandless)
+
 API Changes
 
  * Un-deprecate search(Weight weight, Filter filter, int n) from
Index: src/test/org/apache/lucene/index/TestIndexWriter.java
===================================================================
--- src/test/org/apache/lucene/index/TestIndexWriter.java (revision 829889)
+++ src/test/org/apache/lucene/index/TestIndexWriter.java (working copy)
@@ -4655,4 +4655,20 @@
     w.close();
     d.close();
   }
+
+  public void testEmbeddedFFFF() throws Throwable {
+
+    Directory d = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(d, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
+    Document doc = new Document();
+    doc.add(new Field("field", "a a\uffffb", Field.Store.NO, Field.Index.ANALYZED));
+    w.addDocument(doc);
+    doc = new Document();
+    doc.add(new Field("field", "a", Field.Store.NO, Field.Index.ANALYZED));
+    w.addDocument(doc);
+    w.close();
+
+    _TestUtil.checkIndex(d);
+    d.close();
+  }
 }
Index: src/java/org/apache/lucene/index/TermsHashPerField.java
===================================================================
--- src/java/org/apache/lucene/index/TermsHashPerField.java (revision 831026)
+++ src/java/org/apache/lucene/index/TermsHashPerField.java (working copy)
@@ -373,9 +373,11 @@
             ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
           }
         }
-      } else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END)
-        // Unpaired
+      } else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && (ch <= UnicodeUtil.UNI_SUR_HIGH_END ||
+                                                          ch == 0xffff)) {
+        // Unpaired or 0xffff
         ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
+      }
 
       code = (code*31) + ch;
     }