Index: src/test/org/apache/lucene/analysis/TestAnalyzers.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestAnalyzers.java	(revision 784630)
+++ src/test/org/apache/lucene/analysis/TestAnalyzers.java	(working copy)
@@ -18,6 +18,7 @@
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 
 import org.apache.lucene.analysis.standard.StandardTokenizer;
@@ -63,6 +64,12 @@
                      new String[] { "b" });
     assertAnalyzesTo(a, "\"QUOTED\" word",
                      new String[] { "quoted", "word" });
+
+    /* supplementary test case.
+     * simple analyzer will delete all supplementary codepoints, regardless of category.
+     * recommendation: this should probably be fixed.
+     */
+    assertAnalyzesTo(a, "𐐖𐐖𐐖𐐖𐐖", new String [] {});
   }
 
   public void testNull() throws Exception {
@@ -83,6 +90,12 @@
                      new String[] { "2B" });
     assertAnalyzesTo(a, "\"QUOTED\" word",
                      new String[] { "\"QUOTED\"", "word" });
+
+    /* supplementary test case.
+     * whitespace analyzer works just fine.
+     * all whitespace characters are in the BMP, and all other text is left alone
+     */
+    assertAnalyzesTo(a, "𐐖𐐖𐐖𐐖𐐖 𐐖𐐖𐐖𐐖𐐖", new String [] {"𐐖𐐖𐐖𐐖𐐖", "𐐖𐐖𐐖𐐖𐐖"});
   }
 
   public void testStop() throws Exception {
@@ -91,7 +104,31 @@
                      new String[] { "foo", "bar", "foo", "bar" });
     assertAnalyzesTo(a, "foo a bar such FOO THESE BAR",
                      new String[] { "foo", "bar", "foo", "bar" });
+
+    /* supplementary test case.
+     * stop analyzer will delete all supplementary codepoints, regardless of category.
+     * recommendation: this should probably be fixed.
+     */
+    assertAnalyzesTo(a, "𐐖𐐖𐐖𐐖𐐖", new String [] {});
   }
+
+  /* supplementary test case.
+   * lowercasefilter won't mess up supplementary codepoints, it just won't lowercase them.
+   * recommendation: this should probably be fixed.
+   */
+  public void testLowerCase() throws Exception {
+    Analyzer a = new Analyzer() {
+      public TokenStream tokenStream(String fieldName, Reader reader) {
+        return new LowerCaseFilter(new WhitespaceTokenizer(reader));
+      }
+    };
+
+    assertAnalyzesTo(a, "𐐖𐐖𐐖𐐖𐐖", new String [] { "𐐖𐐖𐐖𐐖𐐖" });
+
+    // show that these characters should change on a lower case operation....
+    int firstChar = Character.codePointAt("𐐖".toCharArray(), 0);
+    assertFalse(firstChar == Character.toLowerCase(firstChar));
+  }
 
   void verifyPayload(TokenStream ts) throws IOException {
     PayloadAttribute payloadAtt = (PayloadAttribute) ts.getAttribute(PayloadAttribute.class);
Index: src/test/org/apache/lucene/analysis/TestLengthFilter.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestLengthFilter.java	(revision 784630)
+++ src/test/org/apache/lucene/analysis/TestLengthFilter.java	(working copy)
@@ -38,5 +38,19 @@
     assertEquals("foo", termAtt.term());
     assertFalse(filter.incrementToken());
   }
+
+  /**
+   * Supplementary test case.
+   * The length filter (with min of 2 and max of 6) will delete this 5 character word.
+   * This is because although it is 5 codepoints, it is 10 UTF-16 code units!
+   * recommendation: leave alone, but javadoc LengthFilter specifying that it measures length in terms of code units.
+   */
+  public void testSupplementary() throws Exception {
+    TokenStream stream = new WhitespaceTokenizer(
+        new StringReader("𐐖𐐖𐐖𐐖𐐖"));
+    LengthFilter filter = new LengthFilter(stream, 2, 6);
+    TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
+    assertFalse(filter.incrementToken());
+  }
 
 }
Index: src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java	(revision 784630)
+++ src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java	(working copy)
@@ -61,6 +61,20 @@
     ts.close();
   }
 
+  /**
+   * Supplementary test case.
+   * StandardAnalyzer will delete all supplementary codepoints, regardless of category.
+   * recommendation: this should probably be fixed.
+   *
+   * Once this is fixed, it should also be javadoc'ed that maxTokenLength works in code units!
+   */
+  public void testSupplementary() throws Exception {
+    assertAnalyzesTo(a, "𐐖𐐖𐐖𐐖𐐖", new String [] {});
+    StandardAnalyzer sa = new StandardAnalyzer();
+    sa.setMaxTokenLength(6);
+
+    assertAnalyzesTo(sa, "𐐖𐐖𐐖𐐖𐐖", new String [] {});
+  }
 
   public void testMaxTermLength() throws Exception {
     StandardAnalyzer sa = new StandardAnalyzer();
Index: src/test/org/apache/lucene/search/TestFuzzyQuery.java
===================================================================
--- src/test/org/apache/lucene/search/TestFuzzyQuery.java	(revision 784630)
+++ src/test/org/apache/lucene/search/TestFuzzyQuery.java	(working copy)
@@ -249,6 +249,27 @@
     directory.close();
   }
 
+  /**
+   * Supplementary test.
+   * Fuzzy query is extremely strange for supplementary chars because it sees them as surrogate pairs.
+   * In the example below, two words with no characters in common are a match!
+   * Recommendation: javadoc that score is calculated based on UTF-16 code units.
+   */
+  public void testSupplementary() throws IOException {
+    RAMDirectory directory = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
+    addDoc("𐐖𐐖𐐖𐐖", writer);
+    writer.optimize();
+    writer.close();
+    IndexSearcher searcher = new IndexSearcher(directory);
+
+    FuzzyQuery query;
+    // shares no codepoints with the indexed term, yet it matches: edit distance is computed over UTF-16 code units, and the high surrogates are identical (similarity 0.5 >= 0.4f)
+    query = new FuzzyQuery(new Term("field", "𐐗𐐗𐐗𐐗"), 0.4f, 0);
+    ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
+    assertEquals(1, hits.length);
+  }
 
   public void testTokenLengthOpt() throws IOException {
     RAMDirectory directory = new RAMDirectory();
     IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(),
Index: src/test/org/apache/lucene/search/TestWildcard.java
===================================================================
--- src/test/org/apache/lucene/search/TestWildcard.java	(revision 784630)
+++ src/test/org/apache/lucene/search/TestWildcard.java	(working copy)
@@ -135,7 +135,44 @@
     assertMatches(searcher, query5, 0);
     assertMatches(searcher, query6, 1); // Query: 'meta??' matches 'metals' not 'metal'
   }
+
+  /**
+   * Supplementary character test.
+   * the * operator works as expected.
+   * The ? operator does not, as it is a substitute for a single code unit, not codepoint.
+   * recommendation: this should probably be fixed.
+   */
+  public void testSupplementary()
+      throws IOException {
+    RAMDirectory indexStore = getIndexStoreWS("body", new String[] {"𐐖𐐖𐐖𐐖𐐖"});
+    IndexSearcher searcher = new IndexSearcher(indexStore);
+    Query query1 = new WildcardQuery(new Term("body", "𐐖𐐖𐐖𐐖𐐖"));
+    Query query2 = new WildcardQuery(new Term("body", "𐐖𐐖𐐖𐐖*"));
+    Query query3 = new WildcardQuery(new Term("body", "𐐖𐐖𐐖𐐖?"));
+    Query query4 = new WildcardQuery(new Term("body", "𐐖𐐖𐐖𐐖??")); // use two question marks and it will work...
+    assertMatches(searcher, query1, 1);
+    assertMatches(searcher, query2, 1);
+    assertMatches(searcher, query3, 0);
+    assertMatches(searcher, query4, 1);
+  }
+
+  /* use whitespace analyzer for testing supplementary character behavior */
+  private RAMDirectory getIndexStoreWS(String field, String[] contents)
+      throws IOException {
+    RAMDirectory indexStore = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(indexStore, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
+    for (int i = 0; i < contents.length; ++i) {
+      Document doc = new Document();
+      doc.add(new Field(field, contents[i], Field.Store.YES, Field.Index.ANALYZED));
+      writer.addDocument(doc);
+    }
+    writer.optimize();
+    writer.close();
+    return indexStore;
+  }
+
+
 
   private RAMDirectory getIndexStore(String field, String[] contents)
       throws IOException {
     RAMDirectory indexStore = new RAMDirectory();