Property changes on: . ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/preflexfixes:r967125-979432 Property changes on: solr ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/preflexfixes/solr:r967125-979432 Index: solr/common-build.xml =================================================================== --- solr/common-build.xml (revision 979430) +++ solr/common-build.xml (working copy) @@ -44,6 +44,8 @@ + + + + Index: lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java (working copy) @@ -27,7 +27,6 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.RAMDirectory; @@ -88,8 +87,7 @@ int terms = (int) Math.pow(2, bits); RAMDirectory dir = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random, dir, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.KEYWORD, false))); + RandomIndexWriter writer = new RandomIndexWriter(random, dir, new MockAnalyzer(MockTokenizer.KEYWORD, false)); Document doc = new Document(); Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED); Index: lucene/src/test/org/apache/lucene/search/TestNot.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestNot.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestNot.java (working copy) @@ -20,7 +20,6 @@ import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.store.RAMDirectory; @@ -40,8 +39,7 @@ public void testNot() throws Exception { RAMDirectory store = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), store, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), store); Document d1 = new Document(); d1.add(new Field("field", "a b", Field.Store.YES, Field.Index.ANALYZED)); Index: lucene/src/test/org/apache/lucene/search/TestTimeLimitingCollector.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestTimeLimitingCollector.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestTimeLimitingCollector.java (working copy) @@ -24,7 +24,6 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.TimeLimitingCollector.TimeExceededException; @@ -78,8 +77,7 @@ "blueberry pizza", }; directory = new RAMDirectory(); - RandomIndexWriter iw = new RandomIndexWriter(newRandom(), directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter iw = new RandomIndexWriter(newRandom(), directory); for (int i=0; i> docs = new ArrayList>(); Document d = new Document(); Field f = new Field("f", "", Field.Store.NO, Field.Index.ANALYZED); Index: lucene/src/test/org/apache/lucene/search/TestPositionIncrement.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestPositionIncrement.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestPositionIncrement.java (working copy) @@ -35,7 +35,6 @@ import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.queryParser.QueryParser; @@ -91,8 +90,7 @@ } }; Directory store = new MockRAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), store, - new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer)); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), store, analyzer); Document d = new Document(); d.add(new Field("field", "bogus", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(d); @@ -242,8 +240,7 @@ public void testPayloadsPos0() throws Exception { Directory dir = new MockRAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir, - new IndexWriterConfig(TEST_VERSION_CURRENT, new TestPayloadAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir, new TestPayloadAnalyzer()); Document doc = new Document(); doc.add(new Field("content", new StringReader( "a a b c d e a f g h i j a b k k"))); Index: lucene/src/test/org/apache/lucene/search/TestBooleanMinShouldMatch.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestBooleanMinShouldMatch.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestBooleanMinShouldMatch.java (working copy) @@ -20,11 +20,9 @@ import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; @@ -60,8 +58,7 @@ }; index = new RAMDirectory(); - RandomIndexWriter w = new RandomIndexWriter(rnd, index, new IndexWriterConfig( - TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter w = new RandomIndexWriter(rnd, index); for (int i = 0; i < data.length; i++) { Document doc = new Document(); Index: lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java (working copy) @@ -25,8 +25,6 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; @@ -128,8 +126,7 @@ query.setSlop(slop); RAMDirectory ramDir = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random, ramDir, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.WHITESPACE, false))); + RandomIndexWriter writer = new RandomIndexWriter(random, ramDir, new MockAnalyzer(MockTokenizer.WHITESPACE, false)); writer.addDocument(doc); IndexReader reader = writer.getReader(); Index: lucene/src/test/org/apache/lucene/search/TestBooleanOr.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestBooleanOr.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestBooleanOr.java (working copy) @@ -20,11 +20,9 @@ import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; @@ -143,8 +141,7 @@ Random random = newRandom(); // - RandomIndexWriter writer = new RandomIndexWriter(random, dir, new IndexWriterConfig( - TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(random, dir); // Document d = new Document(); Index: lucene/src/test/org/apache/lucene/search/TestDateSort.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestDateSort.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestDateSort.java (working copy) @@ -26,7 +26,6 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; @@ -53,8 +52,7 @@ super.setUp(); // Create an index writer. directory = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory); // oldest doc: // Add the first document. text = "Document 1" dateTime = Oct 10 03:25:22 EDT 2007 Index: lucene/src/test/org/apache/lucene/search/TestSort.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestSort.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestSort.java (working copy) @@ -112,8 +112,7 @@ private Searcher getIndex (boolean even, boolean odd) throws IOException { RAMDirectory indexStore = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random, indexStore, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(random, indexStore); for (int i=0; i it's like having a big hairy scary monster in the basement but being upset that it doesn't have fangs - RandomIndexWriter writer = new RandomIndexWriter(random, dir, new IndexWriterConfig(TEST_VERSION_CURRENT, - new MockAnalyzer(MockTokenizer.KEYWORD, false))); + RandomIndexWriter writer = new RandomIndexWriter(random, dir, new MockAnalyzer(MockTokenizer.KEYWORD, false)); Document doc = new Document(); Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED); doc.add(field); - + List terms = new ArrayList(); for (int i = 0; i < 2000*_TestUtil.getRandomMultiplier(); i++) { - field.setValue(_TestUtil.randomUnicodeString(random)); + String s = _TestUtil.randomUnicodeString(random); + field.setValue(s); + terms.add(s); writer.addDocument(doc); } + + if (VERBOSE) { + // utf16 order + Collections.sort(terms); + System.out.println("UTF16 order:"); + for(String s : terms) { + System.out.println(" " + UnicodeUtil.toHexString(s)); + } + } + reader = writer.getReader(); searcher = new IndexSearcher(reader); writer.close(); @@ -122,8 +135,11 @@ /** test a bunch of random regular expressions */ public void testRegexps() throws Exception { - for (int i = 0; i < 1000*_TestUtil.getRandomMultiplier(); i++) - assertSame(AutomatonTestUtil.randomRegexp(random).toString()); + + for (int i = 0; i < 1000*_TestUtil.getRandomMultiplier(); i++) { + String reg = AutomatonTestUtil.randomRegexp(random).toString(); + assertSame(reg); + } } /** check that the # of hits is the same as from a very Index: lucene/src/test/org/apache/lucene/search/TestRegexpQuery.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestRegexpQuery.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestRegexpQuery.java (working copy) @@ -20,11 +20,9 @@ import java.io.IOException; import java.util.Arrays; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; @@ -48,8 +46,7 @@ public void setUp() throws Exception { super.setUp(); directory = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory); Document doc = new Document(); doc.add(new Field(FN, "the quick brown fox jumps over the lazy ??? dog 493432 49344", Index: lucene/src/test/org/apache/lucene/search/TestDateFilter.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestDateFilter.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestDateFilter.java (working copy) @@ -18,12 +18,10 @@ */ import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.DateTools; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.RAMDirectory; @@ -47,8 +45,7 @@ public void testBefore() throws IOException { // create an index RAMDirectory indexStore = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore); long now = System.currentTimeMillis(); @@ -114,8 +111,7 @@ public void testAfter() throws IOException { // create an index RAMDirectory indexStore = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore); long now = System.currentTimeMillis(); Index: lucene/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java (working copy) @@ -22,13 +22,13 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import java.text.DecimalFormat; +import java.util.Random; import java.io.IOException; /** @@ -80,8 +80,9 @@ super.setUp(); index = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), index, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()) + Random random = newRandom(); + RandomIndexWriter writer = new RandomIndexWriter(random, index, + newIndexWriterConfig(random, TEST_VERSION_CURRENT, new MockAnalyzer()) .setSimilarity(sim)); // hed is the most important field, dek is secondary Index: lucene/src/test/org/apache/lucene/search/TestSimilarity.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestSimilarity.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestSimilarity.java (working copy) @@ -20,6 +20,7 @@ import org.apache.lucene.util.LuceneTestCase; import java.io.IOException; import java.util.Collection; +import java.util.Random; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriterConfig; @@ -65,8 +66,9 @@ public void testSimilarity() throws Exception { RAMDirectory store = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), store, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()) + Random random = newRandom(); + RandomIndexWriter writer = new RandomIndexWriter(random, store, + newIndexWriterConfig(random, TEST_VERSION_CURRENT, new MockAnalyzer()) .setSimilarity(new SimpleSimilarity())); Document d1 = new Document(); Index: lucene/src/test/org/apache/lucene/search/TestTopScoreDocCollector.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestTopScoreDocCollector.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestTopScoreDocCollector.java (working copy) @@ -19,10 +19,8 @@ import java.util.Random; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.store.Directory; @@ -42,8 +40,7 @@ Directory dir = new RAMDirectory(); Random random = newRandom(); - RandomIndexWriter writer = new RandomIndexWriter(random, dir, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(random, dir); for (int i = 0; i < 10; i++) { writer.addDocument(new Document()); } Index: lucene/src/test/org/apache/lucene/search/TestRegexpRandom.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestRegexpRandom.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestRegexpRandom.java (working copy) @@ -51,8 +51,7 @@ super.setUp(); random = newRandom(); dir = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random, dir, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(random, dir); Document doc = new Document(); Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED); Index: lucene/src/test/org/apache/lucene/search/TestSpanQueryFilter.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestSpanQueryFilter.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestSpanQueryFilter.java (working copy) @@ -18,11 +18,9 @@ import java.util.List; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.spans.SpanTermQuery; @@ -40,8 +38,7 @@ public void testFilterWorks() throws Exception { Directory dir = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir); for (int i = 0; i < 500; i++) { Document document = new Document(); document.add(new Field("field", English.intToEnglish(i) + " equals " + English.intToEnglish(i), Index: lucene/src/test/org/apache/lucene/search/TestPrefixRandom.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestPrefixRandom.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestPrefixRandom.java (working copy) @@ -25,7 +25,6 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.RandomIndexWriter; @@ -51,8 +50,7 @@ random = newRandom(); dir = new MockRAMDirectory(); // TODO: fix mocktokenizer to not extend chartokenizer, so you can have an 'empty' keyword. - RandomIndexWriter writer = new RandomIndexWriter(random, dir, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.KEYWORD, false))); + RandomIndexWriter writer = new RandomIndexWriter(random, dir, new MockAnalyzer(MockTokenizer.KEYWORD, false)); Document doc = new Document(); Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED); Index: lucene/src/test/org/apache/lucene/search/TestCustomSearcherSort.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestCustomSearcherSort.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestCustomSearcherSort.java (working copy) @@ -24,12 +24,10 @@ import java.util.Random; import java.util.TreeMap; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.DateTools; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; @@ -59,8 +57,7 @@ super.setUp(); Random rand = newRandom(); index = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(rand, index, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(rand, index); RandomGen random = new RandomGen(rand); for (int i = 0; i < INDEX_SIZE; ++i) { // don't decrease; if to low the // problem doesn't show up Index: lucene/src/test/org/apache/lucene/search/TestPrefixInBooleanQuery.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestPrefixInBooleanQuery.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestPrefixInBooleanQuery.java (working copy) @@ -18,11 +18,9 @@ */ import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.RAMDirectory; @@ -48,8 +46,7 @@ protected void setUp() throws Exception { super.setUp(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory); for (int i = 0; i < 5137; ++i) { Document doc = new Document(); Index: lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java (working copy) @@ -55,8 +55,7 @@ public static void beforeClass() throws Exception { directory = new RAMDirectory(); Random random = newStaticRandom(TestNumericRangeQuery32.class); - RandomIndexWriter writer = new RandomIndexWriter(random, directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(random, directory); NumericField field8 = new NumericField("field8", 8, Field.Store.YES, true), Index: lucene/src/test/org/apache/lucene/search/TestDocBoost.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestDocBoost.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestDocBoost.java (working copy) @@ -20,10 +20,8 @@ import java.io.IOException; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.*; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.RAMDirectory; @@ -40,8 +38,7 @@ public void testDocBoost() throws Exception { RAMDirectory store = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), store, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), store); Fieldable f1 = new Field("field", "word", Field.Store.YES, Field.Index.ANALYZED); Fieldable f2 = new Field("field", "word", Field.Store.YES, Field.Index.ANALYZED); Index: lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java (working copy) @@ -54,8 +54,7 @@ public static void beforeClass() throws Exception { directory = new RAMDirectory(); Random random = newStaticRandom(TestNumericRangeQuery64.class); - RandomIndexWriter writer = new RandomIndexWriter(random, directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(random, directory); NumericField field8 = new NumericField("field8", 8, Field.Store.YES, true), Index: lucene/src/test/org/apache/lucene/search/TestPrefixQuery.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestPrefixQuery.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestPrefixQuery.java (working copy) @@ -20,10 +20,8 @@ import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -38,8 +36,7 @@ String[] categories = new String[] {"/Computers", "/Computers/Mac", "/Computers/Windows"}; - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory); for (int i = 0; i < categories.length; i++) { Document doc = new Document(); doc.add(new Field("category", categories[i], Field.Store.YES, Field.Index.NOT_ANALYZED)); Index: lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java (working copy) @@ -17,7 +17,6 @@ * limitations under the License. */ -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermsEnum; @@ -25,7 +24,6 @@ import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.BytesRef; import org.apache.lucene.store.MockRAMDirectory; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -46,8 +44,7 @@ public void testPhrasePrefix() throws IOException { MockRAMDirectory indexStore = new MockRAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore); add("blueberry pie", writer); add("blueberry strudel", writer); add("blueberry pizza", writer); @@ -152,8 +149,7 @@ // The contained PhraseMultiQuery must contain exactly one term array. MockRAMDirectory indexStore = new MockRAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore); add("blueberry pie", writer); add("blueberry chewing gum", writer); add("blue raspberry pie", writer); @@ -185,8 +181,7 @@ public void testPhrasePrefixWithBooleanQuery() throws IOException { MockRAMDirectory indexStore = new MockRAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore); add("This is a test", "object", writer); add("a note", "note", writer); @@ -214,8 +209,7 @@ public void testNoDocs() throws Exception { MockRAMDirectory indexStore = new MockRAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore); add("a note", "note", writer); IndexReader reader = writer.getReader(); Index: lucene/src/test/org/apache/lucene/search/TestBooleanPrefixQuery.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestBooleanPrefixQuery.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestBooleanPrefixQuery.java (working copy) @@ -22,11 +22,9 @@ import junit.framework.TestSuite; import junit.textui.TestRunner; import org.apache.lucene.store.RAMDirectory; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.search.PrefixQuery; @@ -77,8 +75,7 @@ Query rw1 = null; Query rw2 = null; IndexReader reader = null; - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, new IndexWriterConfig( - TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory); for (int i = 0; i < categories.length; i++) { Document doc = new Document(); doc.add(new Field("category", categories[i], Field.Store.YES, Field.Index.NOT_ANALYZED)); Index: lucene/src/test/org/apache/lucene/search/TestFilteredQuery.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestFilteredQuery.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestFilteredQuery.java (working copy) @@ -17,11 +17,9 @@ * limitations under the License. */ -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause.Occur; @@ -50,8 +48,7 @@ protected void setUp() throws Exception { super.setUp(); directory = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter (newRandom(), directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter (newRandom(), directory); Document doc = new Document(); doc.add (new Field("field", "one two three four five", Field.Store.YES, Field.Index.ANALYZED)); @@ -73,6 +70,11 @@ doc.add (new Field("sorter", "c", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument (doc); + // tests here require single segment (eg try seed + // 8239472272678419952L), because SingleDocTestFilter(x) + // blindly accepts that docID in any sub-segment + writer.optimize(); + reader = writer.getReader(); writer.close (); Index: lucene/src/test/org/apache/lucene/search/TestFieldCacheTermsFilter.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestFieldCacheTermsFilter.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestFieldCacheTermsFilter.java (working copy) @@ -19,11 +19,9 @@ import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.store.MockRAMDirectory; @@ -39,8 +37,7 @@ public void testMissingTerms() throws Exception { String fieldName = "field1"; MockRAMDirectory rd = new MockRAMDirectory(); - RandomIndexWriter w = new RandomIndexWriter(newRandom(), rd, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter w = new RandomIndexWriter(newRandom(), rd); for (int i = 0; i < 100; i++) { Document doc = new Document(); int term = i * 10; //terms are units of 10; Index: lucene/src/test/org/apache/lucene/search/TestTermRangeFilter.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestTermRangeFilter.java (revision 979430) +++ lucene/src/test/org/apache/lucene/search/TestTermRangeFilter.java (working copy) @@ -22,10 +22,8 @@ import java.util.Locale; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.store.RAMDirectory; @@ -401,8 +399,7 @@ /* build an index */ RAMDirectory farsiIndex = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(rand, farsiIndex, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(rand, farsiIndex); Document doc = new Document(); doc.add(new Field("content", "\u0633\u0627\u0628", Field.Store.YES, Field.Index.NOT_ANALYZED)); @@ -442,8 +439,7 @@ /* build an index */ RAMDirectory danishIndex = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(rand, danishIndex, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(rand, danishIndex); // Danish collation orders the words below in the given order // (example taken from TestSort.testInternationalSort() ). String[] words = {"H\u00D8T", "H\u00C5T", "MAND"}; Index: lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java (revision 979430) +++ lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java (working copy) @@ -21,6 +21,7 @@ import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util._TestUtil; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -64,7 +65,7 @@ public void testPrevTermAtEnd() throws IOException { Directory dir = new MockRAMDirectory(); - IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); addDoc(writer, "aaa bbb"); writer.close(); SegmentReader reader = SegmentReader.getOnlySegmentReader(dir); Index: lucene/src/test/org/apache/lucene/index/TestRollback.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestRollback.java (revision 979430) +++ lucene/src/test/org/apache/lucene/index/TestRollback.java (working copy) @@ -31,7 +31,7 @@ // LUCENE-2536 public void testRollbackIntegrityWithBufferFlush() throws Exception { Directory dir = new MockRAMDirectory(); - RandomIndexWriter rw = new RandomIndexWriter(newRandom(), dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter rw = new RandomIndexWriter(newRandom(), dir); for (int i = 0; i < 5; i++) { Document doc = new Document(); doc.add(new Field("pk", Integer.toString(i), Store.YES, Index.ANALYZED_NO_NORMS)); Index: lucene/src/test/org/apache/lucene/index/TestIndexReader.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestIndexReader.java (revision 979430) +++ lucene/src/test/org/apache/lucene/index/TestIndexReader.java (working copy) @@ -1675,7 +1675,7 @@ // LUCENE-1586: getUniqueTermCount public void testUniqueTermCount() throws Exception { Directory dir = new MockRAMDirectory(); - IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); Document doc = new Document(); doc.add(new Field("field", "a b c d e f g h i j k l m n o p q r s t u v w x y z", Field.Store.NO, Field.Index.ANALYZED)); doc.add(new Field("number", "0 1 2 3 4 5 6 7 8 9", Field.Store.NO, Field.Index.ANALYZED)); @@ -1708,7 +1708,7 @@ // LUCENE-1609: don't load terms index public void testNoTermsIndex() throws Throwable { Directory dir = new MockRAMDirectory(); - IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); Document doc = new Document(); doc.add(new Field("field", "a b c d e f g h i j k l m n o p q r s t u v w x y z", Field.Store.NO, Field.Index.ANALYZED)); doc.add(new Field("number", "0 1 2 3 4 5 6 7 8 9", Field.Store.NO, Field.Index.ANALYZED)); @@ -1725,7 +1725,7 @@ } assertEquals(-1, ((SegmentReader) r.getSequentialSubReaders()[0]).getTermInfosIndexDivisor()); - writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); writer.addDocument(doc); writer.close(); Index: lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java (revision 979430) +++ lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java (working copy) @@ -68,10 +68,10 @@ // TODO: verify equals using IW.getReader DocsAndWriter dw = indexRandomIWReader(5, 3, 100, dir); - IndexReader r = dw.writer.getReader(); + IndexReader reader = dw.writer.getReader(); dw.writer.commit(); - verifyEquals(r, dir, "id"); - r.close(); + verifyEquals(r, reader, dir, "id"); + reader.close(); dw.writer.close(); dir.close(); } @@ -261,8 +261,8 @@ w.close(); } - public static void verifyEquals(IndexReader r1, Directory dir2, String idField) throws Throwable { - IndexReader r2 = IndexReader.open(dir2, true); + public static void verifyEquals(Random r, IndexReader r1, Directory dir2, String idField) throws Throwable { + IndexReader r2 = IndexReader.open(dir2); verifyEquals(r1, r2, idField); r2.close(); } Index: lucene/src/test/org/apache/lucene/index/TestFlex.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestFlex.java (revision 979430) +++ lucene/src/test/org/apache/lucene/index/TestFlex.java (working copy) @@ -20,6 +20,8 @@ import java.io.*; import java.util.*; import org.apache.lucene.store.*; +import org.apache.lucene.index.codecs.*; +import org.apache.lucene.index.codecs.standard.*; import org.apache.lucene.search.*; import org.apache.lucene.analysis.*; import org.apache.lucene.document.*; @@ -64,7 +66,8 @@ public void testTermOrd() throws Exception { Directory d = new MockRAMDirectory(); - IndexWriter w = new IndexWriter(d, new MockAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); + IndexWriter w = new IndexWriter(d, new IndexWriterConfig(TEST_VERSION_CURRENT, + new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); Document doc = new Document(); doc.add(new Field("f", "a b c", Field.Store.NO, Field.Index.ANALYZED)); w.addDocument(doc); Index: lucene/src/test/org/apache/lucene/index/TestIndexWriter.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (revision 979430) +++ lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (working copy) @@ -4559,7 +4559,7 @@ dir.close(); } - // LUCENE-2095: make sure with multiple threads commit + // LUCENE-2095: make sure with multiple threads commit // doesn't return until all changes are in fact in the // index public void testCommitThreadSafety() throws Throwable { @@ -4670,19 +4670,19 @@ } // Make sure terms, including ones with surrogate pairs, - // sort in UTF16 sort order by default + // sort in codepoint sort order by default public void testTermUTF16SortOrder() throws Throwable { + Random rnd = newRandom(); Directory dir = new MockRAMDirectory(); - IndexWriter writer = new IndexWriter(dir, new MockAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); + RandomIndexWriter writer = new RandomIndexWriter(rnd, dir); Document d = new Document(); // Single segment Field f = new Field("f", "", Field.Store.NO, Field.Index.NOT_ANALYZED); d.add(f); char[] chars = new char[2]; - Random rnd = newRandom(); final Set allTerms = new HashSet(); - for(int i=0;i<200*_TestUtil.getRandomMultiplier();i++) { + for(int i=0;i<10*_TestUtil.getRandomMultiplier();i++) { final String s; if (rnd.nextBoolean()) { @@ -4705,14 +4705,13 @@ allTerms.add(s); f.setValue(s); - //System.out.println("add " + termDesc(s)); writer.addDocument(d); if ((1+i) % 42 == 0) { writer.commit(); } } - + IndexReader r = writer.getReader(); // Test each sub-segment Index: lucene/src/test/org/apache/lucene/index/RandomIndexWriter.java =================================================================== --- lucene/src/test/org/apache/lucene/index/RandomIndexWriter.java (revision 979430) +++ lucene/src/test/org/apache/lucene/index/RandomIndexWriter.java (working copy) @@ -17,20 +17,18 @@ * limitations under the License. */ -import java.util.Random; import java.io.Closeable; import java.io.IOException; +import java.util.Random; -import org.apache.lucene.util._TestUtil; -import org.apache.lucene.store.Directory; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; -import org.apache.lucene.index.codecs.Codec; import org.apache.lucene.index.codecs.CodecProvider; -import org.apache.lucene.index.codecs.intblock.IntBlockCodec; -import org.apache.lucene.index.codecs.preflex.PreFlexCodec; -import org.apache.lucene.index.codecs.pulsing.PulsingCodec; -import org.apache.lucene.index.codecs.sep.SepCodec; -import org.apache.lucene.index.codecs.standard.StandardCodec; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCaseJ4; +import org.apache.lucene.util.Version; +import org.apache.lucene.util._TestUtil; /** Silly class that randomizes the indexing experience. EG * it may swap in a different merge policy/scheduler; may @@ -45,32 +43,48 @@ int docCount; int flushAt; + // Randomly calls Thread.yield so we mixup thread scheduling + private static final class MockIndexWriter extends IndexWriter { + + private final Random r; + + public MockIndexWriter(Random r,Directory dir, IndexWriterConfig conf) throws IOException { + super(dir, conf); + this.r = r; + } + + @Override + boolean testPoint(String name) { + if (r.nextInt(4) == 2) + Thread.yield(); + return true; + } + } + + /** create a RandomIndexWriter with a random config: Uses TEST_VERSION_CURRENT and MockAnalyzer */ + public RandomIndexWriter(Random r, Directory dir) throws IOException { + this(r, dir, LuceneTestCaseJ4.newIndexWriterConfig(r, LuceneTestCaseJ4.TEST_VERSION_CURRENT, new MockAnalyzer())); + } + + /** create a RandomIndexWriter with a random config: Uses TEST_VERSION_CURRENT */ + public RandomIndexWriter(Random r, Directory dir, Analyzer a) throws IOException { + this(r, dir, LuceneTestCaseJ4.newIndexWriterConfig(r, LuceneTestCaseJ4.TEST_VERSION_CURRENT, a)); + } + + /** create a RandomIndexWriter with a random config */ + public RandomIndexWriter(Random r, Directory dir, Version v, Analyzer a) throws IOException { + this(r, dir, LuceneTestCaseJ4.newIndexWriterConfig(r, v, a)); + } + + /** create a RandomIndexWriter with the provided config */ public RandomIndexWriter(Random r, Directory dir, IndexWriterConfig c) throws IOException { this.r = r; - if (r.nextBoolean()) { - c.setMergePolicy(new LogDocMergePolicy()); + w = new MockIndexWriter(r, dir, c); + flushAt = _TestUtil.nextInt(r, 10, 1000); + if (LuceneTestCaseJ4.VERBOSE) { + System.out.println("RIW config=" + w.getConfig()); + System.out.println("codec default=" + CodecProvider.getDefaultCodec()); } - if (r.nextBoolean()) { - c.setMergeScheduler(new SerialMergeScheduler()); - } - if (r.nextBoolean()) { - c.setMaxBufferedDocs(_TestUtil.nextInt(r, 2, 1000)); - } - if (r.nextBoolean()) { - c.setTermIndexInterval(_TestUtil.nextInt(r, 1, 1000)); - } - - if (c.getMergePolicy() instanceof LogMergePolicy) { - LogMergePolicy logmp = (LogMergePolicy) c.getMergePolicy(); - logmp.setUseCompoundDocStore(r.nextBoolean()); - logmp.setUseCompoundFile(r.nextBoolean()); - logmp.setCalibrateSizeByDeletes(r.nextBoolean()); - } - - c.setReaderPooling(r.nextBoolean()); - c.setCodecProvider(new RandomCodecProvider(r)); - w = new IndexWriter(dir, c); - flushAt = _TestUtil.nextInt(r, 10, 1000); } public void addDocument(Document doc) throws IOException { @@ -89,14 +103,27 @@ w.deleteDocuments(term); } + public void commit() throws CorruptIndexException, IOException { + w.commit(); + } + public int maxDoc() { return w.maxDoc(); } public IndexReader getReader() throws IOException { - if (r.nextBoolean()) { + // If we are writing with PreFlexRW, force a full + // IndexReader.open so terms are sorted in codepoint + // order during searching: + if (!w.codecs.getWriter(null).name.equals("PreFlex") && r.nextBoolean()) { + if (LuceneTestCaseJ4.VERBOSE) { + System.out.println("RIW.getReader: use NRT reader"); + } return w.getReader(); } else { + if (LuceneTestCaseJ4.VERBOSE) { + System.out.println("RIW.getReader: open new reader"); + } w.commit(); return IndexReader.open(w.getDirectory(), new KeepOnlyLastCommitDeletionPolicy(), r.nextBoolean(), _TestUtil.nextInt(r, 1, 10)); } @@ -112,22 +139,4 @@ public void optimize() throws IOException { w.optimize(); } - - class RandomCodecProvider extends CodecProvider { - final String codec; - - RandomCodecProvider(Random random) { - register(new StandardCodec()); - register(new IntBlockCodec()); - register(new PreFlexCodec()); - register(new PulsingCodec()); - register(new SepCodec()); - codec = CodecProvider.CORE_CODECS[random.nextInt(CodecProvider.CORE_CODECS.length)]; - } - - @Override - public Codec getWriter(SegmentWriteState state) { - return lookup(codec); - } - } } Index: lucene/src/test/org/apache/lucene/index/TestMultiFields.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestMultiFields.java (revision 979430) +++ lucene/src/test/org/apache/lucene/index/TestMultiFields.java (working copy) @@ -27,12 +27,13 @@ public void testRandom() throws Exception { + Random r = newRandom(); + for(int iter=0;iter<2*_TestUtil.getRandomMultiplier();iter++) { Directory dir = new MockRAMDirectory(); + IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setMergePolicy(NoMergePolicy.COMPOUND_FILES)); - Random r = new Random(); - Map> docs = new HashMap>(); Set deleted = new HashSet(); List terms = new ArrayList(); @@ -45,7 +46,7 @@ doc.add(id); boolean onlyUniqueTerms = r.nextBoolean(); - + Set uniqueTerms = new HashSet(); for(int i=0;i 0) { @@ -61,6 +62,7 @@ } docs.get(term).add(i); terms.add(term); + uniqueTerms.add(term); f.setValue(s); } id.setValue(""+i); @@ -75,8 +77,18 @@ } } + if (VERBOSE) { + List termsList = new ArrayList(uniqueTerms); + Collections.sort(termsList, BytesRef.getUTF8SortedAsUTF16Comparator()); + System.out.println("UTF16 order:"); + for(BytesRef b : termsList) { + System.out.println(" " + UnicodeUtil.toHexString(b.utf8ToString())); + } + } + IndexReader reader = w.getReader(); w.close(); + //System.out.println("TEST reader=" + reader); Bits delDocs = MultiFields.getDeletedDocs(reader); for(int delDoc : deleted) { Index: lucene/src/test/org/apache/lucene/index/TestCodecs.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestCodecs.java (revision 979430) +++ lucene/src/test/org/apache/lucene/index/TestCodecs.java (working copy) @@ -493,15 +493,22 @@ // Test random seek by ord: final int idx = TestCodecs.this.nextInt(field.terms.length); term = field.terms[idx]; - status = termsEnum.seek(idx); - assertEquals(status, TermsEnum.SeekStatus.FOUND); - assertTrue(termsEnum.term().bytesEquals(new BytesRef(term.text2))); - assertEquals(term.docs.length, termsEnum.docFreq()); - if (field.omitTF) { - this.verifyDocs(term.docs, term.positions, termsEnum.docs(null, null), false); - } else { - this.verifyDocs(term.docs, term.positions, termsEnum.docsAndPositions(null, null), true); + try { + status = termsEnum.seek(idx); + } catch (UnsupportedOperationException uoe) { + // ok -- skip it + status = null; } + if (status != null) { + assertEquals(status, TermsEnum.SeekStatus.FOUND); + assertTrue(termsEnum.term().bytesEquals(new BytesRef(term.text2))); + assertEquals(term.docs.length, termsEnum.docFreq()); + if (field.omitTF) { + this.verifyDocs(term.docs, term.positions, termsEnum.docs(null, null), false); + } else { + this.verifyDocs(term.docs, term.positions, termsEnum.docsAndPositions(null, null), true); + } + } // Test seek to non-existent terms: for(int i=0;i<100;i++) { @@ -520,9 +527,12 @@ // Seek to each term by ord, backwards for(int i=field.terms.length-1;i>=0;i--) { - assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seek(i)); - assertEquals(field.terms[i].docs.length, termsEnum.docFreq()); - assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[i].text2))); + try { + assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seek(i)); + assertEquals(field.terms[i].docs.length, termsEnum.docFreq()); + assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[i].text2))); + } catch (UnsupportedOperationException uoe) { + } } // Seek to non-existent empty-string term Index: lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java (revision 979430) +++ lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java (working copy) @@ -394,18 +394,18 @@ } public void testDeletesOnDiskFull() throws IOException { - testOperationsOnDiskFull(false); + doTestOperationsOnDiskFull(false); } public void testUpdatesOnDiskFull() throws IOException { - testOperationsOnDiskFull(true); + doTestOperationsOnDiskFull(true); } /** * Make sure if modifier tries to commit but hits disk full that modifier * remains consistent and usable. Similar to TestIndexReader.testDiskFull(). */ - private void testOperationsOnDiskFull(boolean updates) throws IOException { + private void doTestOperationsOnDiskFull(boolean updates) throws IOException { Term searchTerm = new Term("content", "aaa"); int START_COUNT = 157; @@ -700,6 +700,7 @@ try { modifier.commit(); } catch (IOException ioe) { + // expected failed = true; } Index: lucene/src/test/org/apache/lucene/index/TestAddIndexes.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestAddIndexes.java (revision 979430) +++ lucene/src/test/org/apache/lucene/index/TestAddIndexes.java (working copy) @@ -19,7 +19,6 @@ import java.io.IOException; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; @@ -139,7 +138,6 @@ setUpDirs(dir, aux); IndexWriter writer = newWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setOpenMode(OpenMode.APPEND)); - writer.addIndexes(new Directory[] {aux}); // Adds 10 docs, then replaces them with another 10 Index: lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java =================================================================== --- lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java (revision 979430) +++ lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java (working copy) @@ -1,225 +0,0 @@ -package org.apache.lucene.index.codecs.preflex; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -import java.io.IOException; -import org.apache.lucene.store.*; -import org.apache.lucene.index.*; -import org.apache.lucene.util.*; - - -/** This stores a monotonically increasing set of pairs in a - Directory. A TermInfos can be written once, in order. */ - -final class TermInfosWriter { - /** The file format version, a negative number. */ - public static final int FORMAT = -3; - - // Changed strings to true utf8 with length-in-bytes not - // length-in-chars - public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4; - - // NOTE: always change this if you switch to a new format! - public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES; - - private FieldInfos fieldInfos; - private IndexOutput output; - private TermInfo lastTi = new TermInfo(); - private long size; - - // TODO: the default values for these two parameters should be settable from - // IndexWriter. However, once that's done, folks will start setting them to - // ridiculous values and complaining that things don't work well, as with - // mergeFactor. So, let's wait until a number of folks find that alternate - // values work better. Note that both of these values are stored in the - // segment, so that it's safe to change these w/o rebuilding all indexes. - - /** Expert: The fraction of terms in the "dictionary" which should be stored - * in RAM. Smaller values use more memory, but make searching slightly - * faster, while larger values use less memory and make searching slightly - * slower. Searching is typically not dominated by dictionary lookup, so - * tweaking this is rarely useful.*/ - int indexInterval = 128; - - /** Expert: The fraction of {@link TermDocs} entries stored in skip tables, - * used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in - * smaller indexes, greater acceleration, but fewer accelerable cases, while - * smaller values result in bigger indexes, less acceleration and more - * accelerable cases. More detailed experiments would be useful here. */ - int skipInterval = 16; - - /** Expert: The maximum number of skip levels. Smaller values result in - * slightly smaller indexes, but slower skipping in big posting lists. - */ - int maxSkipLevels = 10; - - private long lastIndexPointer; - private boolean isIndex; - private byte[] lastTermBytes = new byte[10]; - private int lastTermBytesLength = 0; - private int lastFieldNumber = -1; - - private TermInfosWriter other; - - TermInfosWriter(Directory directory, String segment, FieldInfos fis, - int interval) - throws IOException { - initialize(directory, segment, fis, interval, false); - other = new TermInfosWriter(directory, segment, fis, interval, true); - other.other = this; - } - - private TermInfosWriter(Directory directory, String segment, FieldInfos fis, - int interval, boolean isIndex) throws IOException { - initialize(directory, segment, fis, interval, isIndex); - } - - private void initialize(Directory directory, String segment, FieldInfos fis, - int interval, boolean isi) throws IOException { - indexInterval = interval; - fieldInfos = fis; - isIndex = isi; - output = directory.createOutput(segment + (isIndex ? ".tii" : ".tis")); - output.writeInt(FORMAT_CURRENT); // write format - output.writeLong(0); // leave space for size - output.writeInt(indexInterval); // write indexInterval - output.writeInt(skipInterval); // write skipInterval - output.writeInt(maxSkipLevels); // write maxSkipLevels - assert initUTF16Results(); - } - - void add(Term term, TermInfo ti) throws IOException { - add(fieldInfos.fieldNumber(term.field()), term.bytes().bytes, term.bytes().length, ti); - } - - // Currently used only by assert statements - UnicodeUtil.UTF16Result utf16Result1; - UnicodeUtil.UTF16Result utf16Result2; - - // Currently used only by assert statements - private boolean initUTF16Results() { - utf16Result1 = new UnicodeUtil.UTF16Result(); - utf16Result2 = new UnicodeUtil.UTF16Result(); - return true; - } - - // Currently used only by assert statement - private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) { - - if (lastFieldNumber != fieldNumber) { - final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber)); - // If there is a field named "" (empty string) then we - // will get 0 on this comparison, yet, it's "OK". But - // it's not OK if two different field numbers map to - // the same name. - if (cmp != 0 || lastFieldNumber != -1) - return cmp; - } - - UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1); - UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2); - final int len; - if (utf16Result1.length < utf16Result2.length) - len = utf16Result1.length; - else - len = utf16Result2.length; - - for(int i=0;i, TermInfo> pair to the set. - Term must be lexicographically greater than all previous Terms added. - TermInfo pointers must be positive and greater than all previous.*/ - void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti) - throws IOException { - - assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 || - (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) : - "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" + - " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" + - " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8"); - - assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")"; - assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")"; - - if (!isIndex && size % indexInterval == 0) - other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term - - writeTerm(fieldNumber, termBytes, termBytesLength); // write term - - output.writeVInt(ti.docFreq); // write doc freq - output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers - output.writeVLong(ti.proxPointer - lastTi.proxPointer); - - if (ti.docFreq >= skipInterval) { - output.writeVInt(ti.skipOffset); - } - - if (isIndex) { - output.writeVLong(other.output.getFilePointer() - lastIndexPointer); - lastIndexPointer = other.output.getFilePointer(); // write pointer - } - - lastFieldNumber = fieldNumber; - lastTi.set(ti); - size++; - } - - private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength) - throws IOException { - - // TODO: UTF16toUTF8 could tell us this prefix - // Compute prefix in common with last term: - int start = 0; - final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength; - while(start < limit) { - if (termBytes[start] != lastTermBytes[start]) - break; - start++; - } - - final int length = termBytesLength - start; - output.writeVInt(start); // write shared prefix length - output.writeVInt(length); // write delta length - output.writeBytes(termBytes, start, length); // write delta bytes - output.writeVInt(fieldNumber); // write field num - if (lastTermBytes.length < termBytesLength) { - lastTermBytes = ArrayUtil.grow(lastTermBytes, termBytesLength); - } - System.arraycopy(termBytes, start, lastTermBytes, start, length); - lastTermBytesLength = termBytesLength; - } - - /** Called to complete TermInfos creation. */ - void close() throws IOException { - output.seek(4); // write size after format - output.writeLong(size); - output.close(); - - if (!isIndex) - other.close(); - } - -} Index: lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java =================================================================== --- lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java (revision 979430) +++ lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java (working copy) @@ -18,8 +18,10 @@ */ import org.apache.lucene.store.*; +import org.apache.lucene.document.*; +import org.apache.lucene.analysis.*; import org.apache.lucene.index.*; -import org.apache.lucene.index.codecs.*; +import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec; import org.apache.lucene.util.*; import java.util.*; @@ -30,8 +32,6 @@ public class TestSurrogates extends LuceneTestCaseJ4 { - // chooses from a very limited alphabet to exacerbate the - // surrogate seeking required private static String makeDifficultRandomUnicodeString(Random r) { final int end = r.nextInt(20); if (end == 0) { @@ -44,154 +44,297 @@ if (0 == t && i < end - 1) { // hi - buffer[i++] = (char) 0xd800; + buffer[i++] = (char) (0xd800 + r.nextInt(2)); // lo - buffer[i] = (char) 0xdc00; + buffer[i] = (char) (0xdc00 + r.nextInt(2)); } else if (t <= 3) { - buffer[i] = 'a'; + buffer[i] = (char) ('a' + r.nextInt(2)); } else if (4 == t) { - buffer[i] = 0xe000; + buffer[i] = (char) (0xe000 + r.nextInt(2)); } } return new String(buffer, 0, end); } - private SegmentInfo makePreFlexSegment(Random r, String segName, Directory dir, FieldInfos fieldInfos, Codec codec, List fieldTerms) throws IOException { + private String toHexString(Term t) { + return t.field() + ":" + UnicodeUtil.toHexString(t.text()); + } - final int numField = _TestUtil.nextInt(r, 2, 5); + private String getRandomString(Random r) { + String s; + if (r.nextInt(5) == 1) { + if (r.nextInt(3) == 1) { + s = makeDifficultRandomUnicodeString(r); + } else { + s = _TestUtil.randomUnicodeString(r); + } + } else { + s = _TestUtil.randomRealisticUnicodeString(r); + } + return s; + } - List terms = new ArrayList(); + private static class SortTermAsUTF16Comparator implements Comparator { + public int compare(Term o1, Term o2) { + return o1.compareToUTF16(o2); + } + } - int tc = 0; + private static final SortTermAsUTF16Comparator termAsUTF16Comparator = new SortTermAsUTF16Comparator(); - for(int f=0;f fieldTerms, IndexReader reader, int uniqueTermCount) throws IOException { - fieldInfos.add(field, true, false, false, false, false, false, false); - final int numTerms = 10000*_TestUtil.getRandomMultiplier(); - for(int i=0;i= fieldTerms.size()) { + break; + } + term = fieldTerms.get(1+spot+i); + if (term.field() != field) { + assertNull(te.next()); + break; + } else { + BytesRef t = te.next(); + + if (VERBOSE) { + System.out.println(" got term=" + (t == null ? null : UnicodeUtil.toHexString(t.utf8ToString()))); + System.out.println(" exp=" + UnicodeUtil.toHexString(term.text().toString())); + } + + assertEquals(term.bytes(), t); + } + } } - w.close(); + } - Collections.sort(fieldTerms); + private void doTestSeekDoesNotExist(Random r, int numField, List fieldTerms, Term[] fieldTermsArray, IndexReader reader) throws IOException { + + final Map tes = new HashMap(); + if (VERBOSE) { - System.out.println("\nTEST: codepoint order"); - for(Term t: fieldTerms) { - System.out.println(" " + t.field() + ":" + toHexString(t)); - } + System.out.println("TEST: top random seeks"); } - dir.createOutput(segName + ".prx").close(); - dir.createOutput(segName + ".frq").close(); + { + for(int iter=0;iter<100*_TestUtil.getRandomMultiplier();iter++) { + + // seek to random spot + String field = ("f" + r.nextInt(numField)).intern(); + Term tx = new Term(field, getRandomString(r)); - // !!hack alert!! stuffing uniqueTermCount in as docCount - return new SegmentInfo(segName, uniqueTermCount, dir, false, -1, null, false, true, codec); + int spot = Arrays.binarySearch(fieldTermsArray, tx); + + if (spot < 0) { + if (VERBOSE) { + System.out.println("TEST: non-exist seek to " + field + ":" + UnicodeUtil.toHexString(tx.text())); + } + + // term does not exist: + TermsEnum te = tes.get(field); + if (te == null) { + te = MultiFields.getTerms(reader, field).iterator(); + tes.put(field, te); + } + + if (VERBOSE) { + System.out.println(" got enum"); + } + + spot = -spot - 1; + + if (spot == fieldTerms.size() || fieldTerms.get(spot).field() != field) { + assertEquals(TermsEnum.SeekStatus.END, te.seek(tx.bytes())); + } else { + assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seek(tx.bytes())); + + if (VERBOSE) { + System.out.println(" got term=" + UnicodeUtil.toHexString(te.term().utf8ToString())); + System.out.println(" exp term=" + UnicodeUtil.toHexString(fieldTerms.get(spot).text())); + } + + assertEquals(fieldTerms.get(spot).bytes(), + te.term()); + + // now .next() this many times: + int ct = _TestUtil.nextInt(r, 5, 100); + for(int i=0;i= fieldTerms.size()) { + break; + } + Term term = fieldTerms.get(1+spot+i); + if (term.field() != field) { + assertNull(te.next()); + break; + } else { + BytesRef t = te.next(); + + if (VERBOSE) { + System.out.println(" got term=" + (t == null ? null : UnicodeUtil.toHexString(t.utf8ToString()))); + System.out.println(" exp=" + UnicodeUtil.toHexString(term.text().toString())); + } + + assertEquals(term.bytes(), t); + } + } + + } + } + } + } } - private String toHexString(Term t) { - return t.field() + ":" + UnicodeUtil.toHexString(t.text()); - } - + @Test public void testSurrogatesOrder() throws Exception { + Random r = newRandom(); + Directory dir = new MockRAMDirectory(); + RandomIndexWriter w = new RandomIndexWriter(r, + dir, + newIndexWriterConfig(r, TEST_VERSION_CURRENT, + new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec(new PreFlexRWCodec()))); - Codec codec = new PreFlexCodec(); + final int numField = _TestUtil.nextInt(r, 2, 5); - Random r = newRandom(); - FieldInfos fieldInfos = new FieldInfos(); + int uniqueTermCount = 0; + + int tc = 0; + List fieldTerms = new ArrayList(); - SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms); - // hack alert!! - int uniqueTermCount = si.docCount; + for(int f=0;f uniqueTerms = new HashSet(); + for(int i=0;i,Object> checkedClasses = Collections.synchronizedMap(new WeakHashMap,Object>()); + // saves default codec: we do this statically as many build indexes in @beforeClass + private static String savedDefaultCodec; + private static String codec; + private static Codec preFlexSav; + + // returns current PreFlex codec + public static Codec installPreFlexRW() { + final Codec preFlex = CodecProvider.getDefault().lookup("PreFlex"); + if (preFlex != null) { + CodecProvider.getDefault().unregister(preFlex); + } + CodecProvider.getDefault().register(new PreFlexRWCodec()); + return preFlex; + } + + // returns current PreFlex codec + public static void restorePreFlex(Codec preFlex) { + Codec preFlexRW = CodecProvider.getDefault().lookup("PreFlex"); + if (preFlexRW != null) { + CodecProvider.getDefault().unregister(preFlexRW); + } + CodecProvider.getDefault().register(preFlex); + } + + @BeforeClass + public static void beforeClassLuceneTestCaseJ4() { + savedDefaultCodec = CodecProvider.getDefaultCodec(); + codec = _TestUtil.getTestCodec(); + if (codec.equals("random")) + codec = CodecProvider.CORE_CODECS[seedRnd.nextInt(CodecProvider.CORE_CODECS.length)]; + + // If we're running w/ PreFlex codec we must swap in the + // test-only PreFlexRW codec (since core PreFlex can + // only read segments): + if (codec.equals("PreFlex")) { + preFlexSav = installPreFlexRW(); + } + + CodecProvider.setDefaultCodec(codec); + } + + @AfterClass + public static void afterClassLuceneTestCaseJ4() { + // Restore read-only PreFlex codec: + if (codec.equals("PreFlex")) { + restorePreFlex(preFlexSav); + } + CodecProvider.setDefaultCodec(savedDefaultCodec); + } + // This is how we get control when errors occur. // Think of this as start/end/success/failed // events. @@ -369,6 +429,34 @@ return new Random(seed); } + /** create a new index writer config with random defaults */ + public static IndexWriterConfig newIndexWriterConfig(Random r, Version v, Analyzer a) { + IndexWriterConfig c = new IndexWriterConfig(v, a); + if (r.nextBoolean()) { + c.setMergePolicy(new LogDocMergePolicy()); + } + if (r.nextBoolean()) { + c.setMergeScheduler(new SerialMergeScheduler()); + } + if (r.nextBoolean()) { + c.setMaxBufferedDocs(_TestUtil.nextInt(r, 2, 1000)); + } + if (r.nextBoolean()) { + c.setTermIndexInterval(_TestUtil.nextInt(r, 1, 1000)); + } + + if (c.getMergePolicy() instanceof LogMergePolicy) { + LogMergePolicy logmp = (LogMergePolicy) c.getMergePolicy(); + logmp.setUseCompoundDocStore(r.nextBoolean()); + logmp.setUseCompoundFile(r.nextBoolean()); + logmp.setCalibrateSizeByDeletes(r.nextBoolean()); + logmp.setMergeFactor(_TestUtil.nextInt(r, 2, 20)); + } + + c.setReaderPooling(r.nextBoolean()); + return c; + } + public String getName() { return this.name; } @@ -392,6 +480,10 @@ System.out.println("NOTE: random static seed of testclass '" + getName() + "' was: " + staticSeed); } + if (_TestUtil.getTestCodec().equals("random")) { + System.out.println("NOTE: random codec of testcase '" + getName() + "' was: " + codec); + } + if (seed != null) { System.out.println("NOTE: random seed of testcase '" + getName() + "' was: " + seed); } @@ -404,5 +496,4 @@ private static final Random seedRnd = new Random(); private String name = ""; - } Index: lucene/src/test/org/apache/lucene/util/LuceneTestCase.java =================================================================== --- lucene/src/test/org/apache/lucene/util/LuceneTestCase.java (revision 979430) +++ lucene/src/test/org/apache/lucene/util/LuceneTestCase.java (working copy) @@ -29,11 +29,15 @@ import junit.framework.TestCase; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.ConcurrentMergeScheduler; +import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.FieldCache.CacheEntry; import org.apache.lucene.util.FieldCacheSanityChecker.Insanity; +import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.index.codecs.Codec; /** * Base class for all Lucene unit tests. @@ -72,6 +76,10 @@ private volatile Thread.UncaughtExceptionHandler savedUncaughtExceptionHandler = null; + private String savedDefaultCodec; + private String codec; + private Codec preFlexSav; + /** Used to track if setUp and tearDown are called correctly from subclasses */ private boolean setup; @@ -110,6 +118,19 @@ ConcurrentMergeScheduler.setTestMode(); savedBoolMaxClauseCount = BooleanQuery.getMaxClauseCount(); + savedDefaultCodec = CodecProvider.getDefaultCodec(); + + codec = _TestUtil.getTestCodec(); + if (codec.equals("random")) + codec = CodecProvider.CORE_CODECS[seedRnd.nextInt(CodecProvider.CORE_CODECS.length)]; + + // If we're running w/ PreFlex codec we must swap in the + // test-only PreFlexRW codec (since core PreFlex can + // only read segments): + if (codec.equals("PreFlex")) { + preFlexSav = LuceneTestCaseJ4.installPreFlexRW(); + } + CodecProvider.setDefaultCodec(codec); } /** @@ -135,7 +156,12 @@ assertTrue("ensure your setUp() calls super.setUp()!!!", setup); setup = false; BooleanQuery.setMaxClauseCount(savedBoolMaxClauseCount); - + // Restore read-only PreFlex codec: + if (codec.equals("PreFlex")) { + LuceneTestCaseJ4.restorePreFlex(preFlexSav); + } + CodecProvider.setDefaultCodec(savedDefaultCodec); + try { Thread.setDefaultUncaughtExceptionHandler(savedUncaughtExceptionHandler); if (!uncaughtExceptions.isEmpty()) { @@ -264,7 +290,12 @@ this.seed = Long.valueOf(seed); return new Random(seed); } - + + /** create a new index writer config with random defaults */ + public static IndexWriterConfig newIndexWriterConfig(Random r, Version v, Analyzer a) { + return LuceneTestCaseJ4.newIndexWriterConfig(r, v, a); + } + /** Gets a resource from the classpath as {@link File}. This method should only be used, * if a real file is needed. To get a stream, code should prefer * {@link Class#getResourceAsStream} using {@code this.getClass()}. @@ -284,6 +315,9 @@ seed = null; super.runBare(); } catch (Throwable e) { + if (_TestUtil.getTestCodec().equals("random")) { + System.out.println("NOTE: random codec of testcase '" + getName() + "' was: " + codec); + } if (seed != null) { System.out.println("NOTE: random seed of testcase '" + getName() + "' was: " + seed); } Index: lucene/src/test/org/apache/lucene/util/_TestUtil.java =================================================================== --- lucene/src/test/org/apache/lucene/util/_TestUtil.java (revision 979430) +++ lucene/src/test/org/apache/lucene/util/_TestUtil.java (working copy) @@ -23,6 +23,9 @@ import org.apache.lucene.index.MergeScheduler; import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.CheckIndex; +import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.store.Directory; import java.io.ByteArrayOutputStream; import java.io.PrintStream; @@ -129,9 +132,25 @@ } final char[] buffer = new char[end]; for (int i = 0; i < end; i++) { + int t = r.nextInt(5); + //buffer[i] = (char) (97 + r.nextInt(26)); + + /* if (0 == t && i < end - 1) { + // hi + buffer[i++] = (char) 0xd800; + // lo + buffer[i] = (char) 0xdc00; + } else if (t <= 3) { + buffer[i] = 'a'; + } else if (4 == t) { + buffer[i] = 0xe000; + } + */ + + if (0 == t && i < end - 1) { // Make a surrogate pair // High surrogate buffer[i++] = (char) nextInt(r, 0xd800, 0xdbff); @@ -218,4 +237,39 @@ public static int getRandomMultiplier() { return Integer.parseInt(System.getProperty("random.multiplier", "1")); } + + /** gets the codec to run tests with */ + public static String getTestCodec() { + // by default we randomly pick a different codec for + // each test case (non-J4 tests) and each test class (J4 + // tests) + return System.getProperty("tests.codec", "random"); + } + + public static CodecProvider alwaysCodec(final Codec c) { + return new CodecProvider() { + @Override + public Codec getWriter(SegmentWriteState state) { + return c; + } + + @Override + public Codec lookup(String name) { + // can't do this until we fix PreFlexRW to not + //impersonate PreFlex: + if (name.equals(c.name)) { + return c; + } else { + return CodecProvider.getDefault().lookup(name); + } + } + }; + } + + /** Return a CodecProvider that can read any of the + * default codecs, but always writes in the specified + * codec. */ + public static CodecProvider alwaysCodec(final String codec) { + return alwaysCodec(CodecProvider.getDefault().lookup(codec)); + } } Property changes on: lucene\src\test\org\apache\lucene\util\TestAttributeSource.java ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/preflexfixes/lucene/src/test/org/apache/lucene/util/TestAttributeSource.java:r967125-979432 Index: lucene/src/test/org/apache/lucene/document/TestBinaryDocument.java =================================================================== --- lucene/src/test/org/apache/lucene/document/TestBinaryDocument.java (revision 979430) +++ lucene/src/test/org/apache/lucene/document/TestBinaryDocument.java (working copy) @@ -2,9 +2,7 @@ import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.store.MockRAMDirectory; @@ -58,8 +56,7 @@ /** add the doc to a ram index */ MockRAMDirectory dir = new MockRAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir); writer.addDocument(doc); /** open a reader and fetch the document */ @@ -98,8 +95,7 @@ /** add the doc to a ram index */ MockRAMDirectory dir = new MockRAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir); writer.addDocument(doc); /** open a reader and fetch the document */ Property changes on: lucene\src\test\org\apache\lucene\document\TestNumberTools.java ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/preflexfixes/lucene/src/test/org/apache/lucene/document/TestNumberTools.java:r967125-979432 Property changes on: lucene\src\test\org\apache\lucene\document\TestDateTools.java ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/preflexfixes/lucene/src/test/org/apache/lucene/document/TestDateTools.java:r967125-979432 Index: lucene/src/test/org/apache/lucene/document/TestDocument.java =================================================================== --- lucene/src/test/org/apache/lucene/document/TestDocument.java (revision 979430) +++ lucene/src/test/org/apache/lucene/document/TestDocument.java (working copy) @@ -1,8 +1,6 @@ package org.apache.lucene.document; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; @@ -155,8 +153,7 @@ */ public void testGetValuesForIndexedDocument() throws Exception { RAMDirectory dir = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir); writer.addDocument(makeDocumentWithFields()); IndexReader reader = writer.getReader(); @@ -234,8 +231,7 @@ Field.Index.NOT_ANALYZED)); RAMDirectory dir = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir); writer.addDocument(doc); field.setValue("id2"); writer.addDocument(doc); Property changes on: lucene\src\java\org\apache\lucene\analysis\Tokenizer.java ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/preflexfixes/lucene/src/java/org/apache/lucene/analysis/Tokenizer.java:r967125-979432 Property changes on: lucene\src\java\org\apache\lucene\search\MultiTermQueryWrapperFilter.java ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/preflexfixes/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:r967125-979432 Index: lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java =================================================================== --- lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java (revision 979430) +++ lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java (working copy) @@ -792,6 +792,7 @@ throws IOException { String field = StringHelper.intern(entryKey.field); + Terms terms = MultiFields.getTerms(reader, field); final boolean fasterButMoreRAM = ((Boolean) entryKey.custom).booleanValue(); Index: lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java (revision 979430) +++ lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java (working copy) @@ -116,7 +116,7 @@ // different TermComps final Comparator subTermComp = termsEnumIndex.termsEnum.getComparator(); if (subTermComp != null && !subTermComp.equals(termComp)) { - throw new IllegalStateException("sub-readers have different BytesRef.Comparators; cannot merge"); + throw new IllegalStateException("sub-readers have different BytesRef.Comparators: " + subTermComp + " vs " + termComp + "; cannot merge"); } } Index: lucene/src/java/org/apache/lucene/index/DocumentsWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (revision 979430) +++ lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (working copy) @@ -1095,7 +1095,7 @@ continue; } assert checkDeleteTerm(term); - + if (termsEnum.seek(term.bytes(), false) == TermsEnum.SeekStatus.FOUND) { DocsEnum docsEnum = termsEnum.docs(reader.getDeletedDocs(), docs); Index: lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java (revision 979430) +++ lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java (working copy) @@ -22,6 +22,7 @@ import org.apache.lucene.index.FieldsEnum; import java.io.IOException; +import java.io.Closeable; /** Abstract API that consumes terms, doc, freq, prox and * payloads postings. Concrete implementations of this @@ -30,7 +31,7 @@ * * @lucene.experimental */ -public abstract class FieldsConsumer { +public abstract class FieldsConsumer implements Closeable { /** Add a new field */ public abstract TermsConsumer addField(FieldInfo field) throws IOException; Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java (revision 979430) +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java (working copy) @@ -84,26 +84,16 @@ format = firstInt; // check that it is a format we can understand - if (format > FORMAT_MINIMUM) - throw new IndexFormatTooOldException(null, format, FORMAT_MINIMUM, FORMAT_CURRENT); - if (format < FORMAT_CURRENT) - throw new IndexFormatTooNewException(null, format, FORMAT_MINIMUM, FORMAT_CURRENT); + if (format > FORMAT_MINIMUM) + throw new IndexFormatTooOldException(null, format, FORMAT_MINIMUM, FORMAT_CURRENT); + if (format < FORMAT_CURRENT) + throw new IndexFormatTooNewException(null, format, FORMAT_MINIMUM, FORMAT_CURRENT); size = input.readLong(); // read the size - if(format == -1){ - if (!isIndex) { - indexInterval = input.readInt(); - formatM1SkipInterval = input.readInt(); - } - // switch off skipTo optimization for file format prior to 1.4rc2 in order to avoid a bug in - // skipTo implementation of these versions - skipInterval = Integer.MAX_VALUE; - } else { - indexInterval = input.readInt(); - skipInterval = input.readInt(); - maxSkipLevels = input.readInt(); - } + indexInterval = input.readInt(); + skipInterval = input.readInt(); + maxSkipLevels = input.readInt(); assert indexInterval > 0: "indexInterval=" + indexInterval + " is negative; must be > 0"; assert skipInterval > 0: "skipInterval=" + skipInterval + " is negative; must be > 0"; } @@ -132,18 +122,21 @@ position = p; termBuffer.set(t); prevBuffer.reset(); + //System.out.println(" ste doSeek prev=" + prevBuffer.toTerm() + " this=" + this); termInfo.set(ti); } /** Increments the enumeration to the next element. True if one exists.*/ public final boolean next() throws IOException { + prevBuffer.set(termBuffer); + //System.out.println(" ste setPrev=" + prev() + " this=" + this); + if (position++ >= size - 1) { - prevBuffer.set(termBuffer); termBuffer.reset(); + //System.out.println(" EOF"); return false; } - prevBuffer.set(termBuffer); termBuffer.read(input, fieldInfos); newSuffixStart = termBuffer.newSuffixStart; @@ -168,6 +161,7 @@ if (isIndex) indexPointer += input.readVLong(); // read index pointer + //System.out.println(" ste ret term=" + term()); return true; } Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java (revision 979430) +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java (working copy) @@ -18,9 +18,10 @@ */ import java.io.IOException; +import java.util.Comparator; + import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.index.Term; import org.apache.lucene.index.FieldInfos; @@ -28,102 +29,65 @@ private String field; private Term term; // cached - private boolean dirty; // true if text was set externally (ie not read via UTF8 bytes) - private UnicodeUtil.UTF16Result text = new UnicodeUtil.UTF16Result(); private BytesRef bytes = new BytesRef(10); - int newSuffixStart; + private static final Comparator utf8AsUTF16Comparator = BytesRef.getUTF8SortedAsUTF16Comparator(); - public final int compareTo(TermBuffer other) { + int newSuffixStart; // only valid right after .read is called + + public int compareTo(TermBuffer other) { if (field == other.field) // fields are interned - return compareChars(text.result, text.length, other.text.result, other.text.length); + return utf8AsUTF16Comparator.compare(bytes, other.bytes); else return field.compareTo(other.field); } - private static int compareChars(char[] chars1, int len1, - char[] chars2, int len2) { - final int end = len1 < len2 ? len1:len2; - for (int k = 0; k < end; k++) { - char c1 = chars1[k]; - char c2 = chars2[k]; - if (c1 != c2) { - return c1 - c2; - } - } - return len1 - len2; - } - - public final void read(IndexInput input, FieldInfos fieldInfos) + public void read(IndexInput input, FieldInfos fieldInfos) throws IOException { this.term = null; // invalidate cache - int start = input.readVInt(); + newSuffixStart = input.readVInt(); int length = input.readVInt(); - int totalLength = start + length; + int totalLength = newSuffixStart + length; if (bytes.bytes.length < totalLength) { bytes.grow(totalLength); } - if (dirty) { - // Fully convert all bytes since bytes is dirty - UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes); - bytes.length = totalLength; - input.readBytes(bytes.bytes, start, length); - UnicodeUtil.UTF8toUTF16(bytes.bytes, 0, totalLength, text); - dirty = false; - } else { - // Incrementally convert only the UTF8 bytes that are new: - bytes.length = totalLength; - input.readBytes(bytes.bytes, start, length); - UnicodeUtil.UTF8toUTF16(bytes.bytes, start, length, text); - } - - while(true) { - newSuffixStart = text.offsets[start]; - if (newSuffixStart != -1) { - break; - } - if (--start == 0) { - newSuffixStart = 0; - break; - } - } + bytes.length = totalLength; + input.readBytes(bytes.bytes, newSuffixStart, length); this.field = fieldInfos.fieldName(input.readVInt()); } - public final void set(Term term) { + public void set(Term term) { if (term == null) { reset(); return; } - - final BytesRef termBytes = term.bytes(); - UnicodeUtil.UTF8toUTF16(termBytes.bytes, termBytes.offset, termBytes.length, text); - dirty = true; + bytes.copy(term.bytes()); field = term.field(); this.term = term; } - public final void set(TermBuffer other) { - text.copyText(other.text); - dirty = true; + public void set(TermBuffer other) { field = other.field; - term = other.term; + // dangerous to copy Term over, since the underlying + // BytesRef could subsequently be modified: + term = null; + bytes.copy(other.bytes); } public void reset() { field = null; - text.setLength(0); term = null; - dirty = true; } public Term toTerm() { if (field == null) // unset return null; - if (term == null) - term = new Term(field, new BytesRef(text.result, 0, text.length), false); + if (term == null) { + term = new Term(field, new BytesRef(bytes), false); + //term = new Term(field, bytes, false); + } return term; } @@ -134,12 +98,7 @@ try { clone = (TermBuffer)super.clone(); } catch (CloneNotSupportedException e) {} - clone.dirty = true; - clone.bytes = new BytesRef(10); - clone.text = new UnicodeUtil.UTF16Result(); - clone.text.offsets = new int[text.offsets.length]; - System.arraycopy(text.offsets, 0, clone.text.offsets, 0, text.offsets.length); - clone.text.copyText(text); + clone.bytes = new BytesRef(bytes); return clone; } } Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java (revision 979430) +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java (working copy) @@ -40,16 +40,16 @@ public class PreFlexCodec extends Codec { /** Extension of terms file */ - static final String TERMS_EXTENSION = "tis"; + public static final String TERMS_EXTENSION = "tis"; /** Extension of terms index file */ - static final String TERMS_INDEX_EXTENSION = "tii"; + public static final String TERMS_INDEX_EXTENSION = "tii"; /** Extension of freq postings file */ - static final String FREQ_EXTENSION = "frq"; + public static final String FREQ_EXTENSION = "frq"; /** Extension of prox postings file */ - static final String PROX_EXTENSION = "prx"; + public static final String PROX_EXTENSION = "prx"; public PreFlexCodec() { name = "PreFlex"; Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfo.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfo.java (revision 979430) +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfo.java (working copy) @@ -23,30 +23,30 @@ * indexing. */ @Deprecated -class TermInfo { +public class TermInfo { /** The number of documents which contain the term. */ - int docFreq = 0; + public int docFreq = 0; - long freqPointer = 0; - long proxPointer = 0; - int skipOffset; + public long freqPointer = 0; + public long proxPointer = 0; + public int skipOffset; - TermInfo() {} + public TermInfo() {} - TermInfo(int df, long fp, long pp) { + public TermInfo(int df, long fp, long pp) { docFreq = df; freqPointer = fp; proxPointer = pp; } - TermInfo(TermInfo ti) { + public TermInfo(TermInfo ti) { docFreq = ti.docFreq; freqPointer = ti.freqPointer; proxPointer = ti.proxPointer; skipOffset = ti.skipOffset; } - final void set(int docFreq, + public final void set(int docFreq, long freqPointer, long proxPointer, int skipOffset) { this.docFreq = docFreq; this.freqPointer = freqPointer; @@ -54,7 +54,7 @@ this.skipOffset = skipOffset; } - final void set(TermInfo ti) { + public final void set(TermInfo ti) { docFreq = ti.docFreq; freqPointer = ti.freqPointer; proxPointer = ti.proxPointer; Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java (revision 979430) +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java (working copy) @@ -119,9 +119,12 @@ indexTerms = new Term[indexSize]; indexInfos = new TermInfo[indexSize]; indexPointers = new long[indexSize]; - - for (int i = 0; indexEnum.next(); i++) { + + for (int i=0;indexEnum.next(); i++) { indexTerms[i] = indexEnum.term(); + assert indexTerms[i] != null; + assert indexTerms[i].text() != null; + assert indexTerms[i].field() != null; indexInfos[i] = indexEnum.termInfo(); indexPointers[i] = indexEnum.indexPointer; @@ -160,14 +163,14 @@ return origEnum.maxSkipLevels; } - final void close() throws IOException { + void close() throws IOException { if (origEnum != null) origEnum.close(); threadResources.close(); } /** Returns the number of term/value pairs in the set. */ - final long size() { + long size() { return size; } @@ -183,12 +186,13 @@ /** Returns the offset of the greatest index entry which is less than or equal to term.*/ - private final int getIndexOffset(Term term) { + private int getIndexOffset(Term term) { int lo = 0; // binary search indexTerms[] int hi = indexTerms.length - 1; while (hi >= lo) { int mid = (lo + hi) >>> 1; + assert indexTerms[mid] != null : "indexTerms = " + indexTerms.length + " mid=" + mid; int delta = term.compareToUTF16(indexTerms[mid]); if (delta < 0) hi = mid - 1; @@ -200,7 +204,7 @@ return hi; } - private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException { + private void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException { enumerator.seek(indexPointers[indexOffset], ((long) indexOffset * totalIndexInterval) - 1, indexTerms[indexOffset], indexInfos[indexOffset]); @@ -231,6 +235,9 @@ } TermInfo seekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd) throws IOException { + if (size == 0) { + return null; + } // optimize sequential access: first try scanning cached enum w/o seeking if (enumerator.term() != null // term is at or past current @@ -242,7 +249,6 @@ // no need to seek final TermInfo ti; - int numScans = enumerator.scanTo(term); if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) { ti = enumerator.termInfo(); @@ -279,6 +285,7 @@ seekEnum(enumerator, indexPos); enumerator.scanTo(term); final TermInfo ti; + if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) { ti = enumerator.termInfo(); if (tiOrd == null) { @@ -294,7 +301,7 @@ } // called only from asserts - private final boolean sameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator) { + private boolean sameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator) { if (ti1.docFreq != ti2.docFreq) { return false; } @@ -319,7 +326,7 @@ } /** Returns the position of a Term in the set or -1. */ - final long getPosition(Term term) throws IOException { + long getPosition(Term term) throws IOException { if (size == 0) return -1; ensureIndexIsRead(); Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (revision 979430) +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (working copy) @@ -40,12 +40,11 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.UnicodeUtil; -import org.apache.lucene.util.ArrayUtil; /** Exposes flex API on a pre-flex index, as a codec. * @lucene.experimental */ public class PreFlexFields extends FieldsProducer { - + private static final boolean DEBUG_SURROGATES = false; public TermInfosReader tis; @@ -60,7 +59,7 @@ private final int readBufferSize; private Directory cfsReader; - PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor) + public PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor) throws IOException { si = info; @@ -107,6 +106,15 @@ this.dir = dir; } + // If this returns, we do the surrogates dance so that the + // terms are sorted by unicode sort order. This should be + // true when segments are used for "normal" searching; + // it's only false during testing, to create a pre-flex + // index, using the test-only PreFlexRW. + protected boolean sortTermsByUnicode() { + return true; + } + static void files(Directory dir, SegmentInfo info, Collection files) throws IOException { files.add(IndexFileNames.segmentFileName(info.name, "", PreFlexCodec.TERMS_EXTENSION)); files.add(IndexFileNames.segmentFileName(info.name, "", PreFlexCodec.TERMS_INDEX_EXTENSION)); @@ -182,6 +190,12 @@ if (cfsReader != null) { cfsReader.close(); } + if (freqStream != null) { + freqStream.close(); + } + if (proxStream != null) { + proxStream.close(); + } } private class PreFlexFieldsEnum extends FieldsEnum { @@ -228,7 +242,11 @@ public Comparator getComparator() { // Pre-flex indexes always sorted in UTF16 order, but // we remap on-the-fly to unicode order - return BytesRef.getUTF8SortedAsUnicodeComparator(); + if (sortTermsByUnicode()) { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } else { + return BytesRef.getUTF8SortedAsUTF16Comparator(); + } } } @@ -238,119 +256,225 @@ private boolean skipNext; private BytesRef current; - private int[] surrogateSeekPending = new int[1]; - private boolean[] surrogateDidSeekBack = new boolean[1]; - private int surrogateSeekUpto; - private char[] pendingPrefix; - private SegmentTermEnum seekTermEnum; private Term protoTerm; + + private static final byte UTF8_NON_BMP_LEAD = (byte) 0xf0; + private static final byte UTF8_HIGH_BMP_LEAD = (byte) 0xee; + + // Returns true if the unicode char is "after" the + // surrogates in UTF16, ie >= U+E000 and <= U+FFFF: + private final boolean isHighBMPChar(byte[] b, int idx) { + return (b[idx] & UTF8_HIGH_BMP_LEAD) == UTF8_HIGH_BMP_LEAD; + } + + // Returns true if the unicode char in the UTF8 byte + // sequence starting at idx encodes a char outside of + // BMP (ie what would be a surrogate pair in UTF16): + private final boolean isNonBMPChar(byte[] b, int idx) { + return (b[idx] & UTF8_NON_BMP_LEAD) == UTF8_NON_BMP_LEAD; + } + + private final byte[] scratch = new byte[4]; + private final BytesRef prevTerm = new BytesRef(); + private final BytesRef scratchTerm = new BytesRef(); private int newSuffixStart; - void reset(FieldInfo fieldInfo) throws IOException { - this.fieldInfo = fieldInfo; - protoTerm = new Term(fieldInfo.name); - if (termEnum == null) { - termEnum = getTermsDict().terms(protoTerm); - seekTermEnum = getTermsDict().terms(protoTerm); + // Swap in S, in place of E: + private boolean seekToNonBMP(SegmentTermEnum te, BytesRef term, int pos) throws IOException { + final int savLength = term.length; + + assert term.offset == 0; + + // The 3 bytes starting at downTo make up 1 + // unicode character: + assert isHighBMPChar(term.bytes, pos); + + // NOTE: we cannot make this assert, because + // AutomatonQuery legitimately sends us malformed UTF8 + // (eg the UTF8 bytes with just 0xee) + // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString(); + + // Save the bytes && length, since we need to + // restore this if seek "back" finds no matching + // terms + if (term.bytes.length < 4+pos) { + term.grow(4+pos); + } + + scratch[0] = term.bytes[pos]; + scratch[1] = term.bytes[pos+1]; + scratch[2] = term.bytes[pos+2]; + + term.bytes[pos] = (byte) 0xf0; + term.bytes[pos+1] = (byte) 0x90; + term.bytes[pos+2] = (byte) 0x80; + term.bytes[pos+3] = (byte) 0x80; + term.length = 4+pos; + + if (DEBUG_SURROGATES) { + System.out.println(" try seek term=" + UnicodeUtil.toHexString(term.utf8ToString())); + } + + // Seek "back": + getTermsDict().seekEnum(te, protoTerm.createTerm(term)); + + // Test if the term we seek'd to in fact found a + // surrogate pair at the same position as the E: + Term t2 = te.term(); + + // Cannot be null (or move to next field) because at + // "worst" it'd seek to the same term we are on now, + // unless we are being called from seek + if (t2 == null || t2.field() != fieldInfo.name) { + return false; + } + + if (DEBUG_SURROGATES) { + System.out.println(" got term=" + UnicodeUtil.toHexString(t2.text())); + } + + // Now test if prefix is identical and we found + // a non-BMP char at the same position: + BytesRef b2 = t2.bytes(); + assert b2.offset == 0; + + boolean matches; + if (b2.length >= term.length && isNonBMPChar(b2.bytes, pos)) { + matches = true; + for(int i=0;i 0) { - sb.append(' '); + boolean didSeek = false; + + final int limit = Math.min(newSuffixStart, scratchTerm.length-1); + + while(downTo > limit) { + + if (isHighBMPChar(prevTerm.bytes, downTo)) { + + if (DEBUG_SURROGATES) { + System.out.println(" found E pos=" + downTo + " vs len=" + prevTerm.length); } - sb.append(surrogateSeekPending[i]); + + if (seekToNonBMP(seekTermEnum, prevTerm, downTo)) { + // TODO: more efficient seek? + getTermsDict().seekEnum(termEnum, seekTermEnum.term()); + //newSuffixStart = downTo+4; + newSuffixStart = downTo; + scratchTerm.copy(termEnum.term().bytes()); + didSeek = true; + if (DEBUG_SURROGATES) { + System.out.println(" seek!"); + } + break; + } else { + if (DEBUG_SURROGATES) { + System.out.println(" no seek"); + } + } } - sb.append(" pendingSeekText=" + new String(pendingPrefix, 0, surrogateSeekPending[surrogateSeekUpto-1])); - return sb.toString(); + + // Shorten prevTerm in place so that we don't redo + // this loop if we come back here: + if ((prevTerm.bytes[downTo] & 0xc0) == 0xc0 || (prevTerm.bytes[downTo] & 0x80) == 0) { + prevTerm.length = downTo; + } + + downTo--; } + + return didSeek; } - private boolean popPendingSeek() throws IOException { + // Look for seek type 3 ("pop"): if the delta from + // prev -> current was replacing an S with an E, + // we must now seek to beyond that E. This seek + // "finishes" the dance at this character + // position. + private boolean doPop() throws IOException { + if (DEBUG_SURROGATES) { - System.out.println(" check pop newSuffix=" + newSuffixStart + " stack=" + getStack()); + System.out.println(" try pop"); } - // if a .next() has advanced beyond the - // after-surrogates range we had last seeked to, we - // must seek back to the start and resume .next from - // there. this pops the pending seek off the stack. - final Term t = termEnum.term(); - if (surrogateSeekUpto > 0) { - final int seekPrefix = surrogateSeekPending[surrogateSeekUpto-1]; + + assert newSuffixStart <= prevTerm.length; + assert newSuffixStart < scratchTerm.length || newSuffixStart == 0; + + if (prevTerm.length > newSuffixStart && + isNonBMPChar(prevTerm.bytes, newSuffixStart) && + isHighBMPChar(scratchTerm.bytes, newSuffixStart)) { + + // Seek type 2 -- put 0xFF at this position: + scratchTerm.bytes[newSuffixStart] = (byte) 0xff; + scratchTerm.length = newSuffixStart+1; + if (DEBUG_SURROGATES) { - System.out.println(" seekPrefix=" + seekPrefix); + System.out.println(" seek to term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString()); } - if (newSuffixStart < seekPrefix) { - assert pendingPrefix != null; - assert pendingPrefix.length > seekPrefix; - pendingPrefix[seekPrefix] = UnicodeUtil.UNI_SUR_HIGH_START; - pendingPrefix[1+seekPrefix] = UnicodeUtil.UNI_SUR_LOW_START; - Term t2 = protoTerm.createTerm(new BytesRef(pendingPrefix, 0, 2+seekPrefix)); + + // TODO: more efficient seek? can we simply swap + // the enums? + getTermsDict().seekEnum(termEnum, protoTerm.createTerm(scratchTerm)); + + final Term t2 = termEnum.term(); + + // We could hit EOF or different field since this + // was a seek "forward": + if (t2 != null && t2.field() == fieldInfo.name) { + if (DEBUG_SURROGATES) { - System.out.println(" do pop; seek back to " + UnicodeUtil.toHexString(t2.text())); + System.out.println(" got term=" + UnicodeUtil.toHexString(t2.text()) + " " + t2.bytes()); } - getTermsDict().seekEnum(termEnum, t2); - surrogateDidSeekBack[surrogateSeekUpto-1] = true; - // +2 because we don't want to re-check the - // surrogates we just seek'd back to - newSuffixStart = seekPrefix + 2; + final BytesRef b2 = t2.bytes(); + assert b2.offset == 0; + + + // Set newSuffixStart -- we can't use + // termEnum's since the above seek may have + // done no scanning (eg, term was precisely + // and index term, or, was in the term seek + // cache): + scratchTerm.copy(b2); + setNewSuffixStart(prevTerm, scratchTerm); + return true; - } else if (newSuffixStart == seekPrefix && surrogateDidSeekBack[surrogateSeekUpto-1] && t != null && t.field() == fieldInfo.name && t.text().charAt(seekPrefix) > UnicodeUtil.UNI_SUR_LOW_END) { - assert pendingPrefix != null; - assert pendingPrefix.length > seekPrefix; - pendingPrefix[seekPrefix] = 0xffff; - Term t2 = protoTerm.createTerm(new BytesRef(pendingPrefix, 0, 1+seekPrefix)); + } else if (newSuffixStart != 0 || scratchTerm.length != 0) { if (DEBUG_SURROGATES) { - System.out.println(" finish pop; seek fwd to " + UnicodeUtil.toHexString(t2.text())); + System.out.println(" got term=null (or next field)"); } - getTermsDict().seekEnum(termEnum, t2); - if (DEBUG_SURROGATES) { - System.out.println(" found term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text()))); - } - surrogateSeekUpto--; - - if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) { - // force pop - newSuffixStart = -1; - } else { - newSuffixStart = termEnum.newSuffixStart; - } - + newSuffixStart = 0; + scratchTerm.length = 0; return true; } } @@ -358,117 +482,249 @@ return false; } - private UnicodeUtil.UTF16Result termBuffer = new UnicodeUtil.UTF16Result(); - private UnicodeUtil.UTF16Result seekBuffer = new UnicodeUtil.UTF16Result(); + // Pre-flex indices store terms in UTF16 sort order, but + // certain queries require Unicode codepoint order; this + // method carefully seeks around surrogates to handle + // this impedance mismatch + + private void surrogateDance() throws IOException { + + if (!unicodeSortOrder) { + return; + } + + // We are invoked after TIS.next() (by UTF16 order) to + // possibly seek to a different "next" (by unicode + // order) term. + + // We scan only the "delta" from the last term to the + // current term, in UTF8 bytes. We look at 1) the bytes + // stripped from the prior term, and then 2) the bytes + // appended to that prior term's prefix. - private boolean pushNewSurrogate() throws IOException { + // We don't care about specific UTF8 sequences, just + // the "category" of the UTF16 character. Category S + // is a high/low surrogate pair (it non-BMP). + // Category E is any BMP char > UNI_SUR_LOW_END (and < + // U+FFFF). Category A is the rest (any unicode char + // <= UNI_SUR_HIGH_START). + + // The core issue is that pre-flex indices sort the + // characters as ASE, while flex must sort as AES. So + // when scanning, when we hit S, we must 1) seek + // forward to E and enum the terms there, then 2) seek + // back to S and enum all terms there, then 3) seek to + // after E. Three different seek points (1, 2, 3). + + // We can easily detect S in UTF8: if a byte has + // prefix 11110 (0xf0), then that byte and the + // following 3 bytes encode a single unicode codepoint + // in S. Similary,we can detect E: if a byte has + // prefix 1110111 (0xee), then that byte and the + // following 2 bytes encode a single unicode codepoint + // in E. + + // Note that this is really a recursive process -- + // maybe the char at pos 2 needs to dance, but any + // point in its dance, suddenly pos 4 needs to dance + // so you must finish pos 4 before returning to pos + // 2. But then during pos 4's dance maybe pos 7 needs + // to dance, etc. However, despite being recursive, + // we don't need to hold any state because the state + // can always be derived by looking at prior term & + // current term. + + // TODO: can we avoid this copy? + if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) { + scratchTerm.length = 0; + } else { + scratchTerm.copy(termEnum.term().bytes()); + } + if (DEBUG_SURROGATES) { - System.out.println(" check push newSuffix=" + newSuffixStart + " stack=" + getStack()); + System.out.println(" dance"); + System.out.println(" prev=" + UnicodeUtil.toHexString(prevTerm.utf8ToString())); + System.out.println(" " + prevTerm.toString()); + System.out.println(" term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString())); + System.out.println(" " + scratchTerm.toString()); } - final Term t = termEnum.term(); - if (t == null || t.field() != fieldInfo.name) { - return false; + + // This code assumes TermInfosReader/SegmentTermEnum + // always use BytesRef.offset == 0 + assert prevTerm.offset == 0; + assert scratchTerm.offset == 0; + + // Need to loop here because we may need to do multiple + // pops, and possibly a continue in the end, ie: + // + // cont + // pop, cont + // pop, pop, cont + // + // + + while(true) { + if (doContinue()) { + break; + } else { + if (!doPop()) { + break; + } + } } - final BytesRef bytes = t.bytes(); - UnicodeUtil.UTF8toUTF16(bytes.bytes, bytes.offset, bytes.length, termBuffer); + if (DEBUG_SURROGATES) { + System.out.println(" finish bmp ends"); + } - for(int i=Math.max(0,newSuffixStart);i= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END && (surrogateSeekUpto == 0 || i > surrogateSeekPending[surrogateSeekUpto-1])) { + doPushes(); + } + + // Look for seek type 1 ("push"): if the newly added + // suffix contains any S, we must try to seek to the + // corresponding E. If we find a match, we go there; + // else we keep looking for additional S's in the new + // suffix. This "starts" the dance, at this character + // position: + private void doPushes() throws IOException { + + int upTo = newSuffixStart; + if (DEBUG_SURROGATES) { + System.out.println(" try push newSuffixStart=" + newSuffixStart + " scratchLen=" + scratchTerm.length); + } + + while(upTo < scratchTerm.length) { + if (isNonBMPChar(scratchTerm.bytes, upTo) && + (upTo > newSuffixStart || + (upTo >= prevTerm.length || + (!isNonBMPChar(prevTerm.bytes, upTo) && + !isHighBMPChar(prevTerm.bytes, upTo))))) { + + // A non-BMP char (4 bytes UTF8) starts here: + assert scratchTerm.length >= upTo + 4; + + final int savLength = scratchTerm.length; + scratch[0] = scratchTerm.bytes[upTo]; + scratch[1] = scratchTerm.bytes[upTo+1]; + scratch[2] = scratchTerm.bytes[upTo+2]; + + scratchTerm.bytes[upTo] = UTF8_HIGH_BMP_LEAD; + scratchTerm.bytes[upTo+1] = (byte) 0x80; + scratchTerm.bytes[upTo+2] = (byte) 0x80; + scratchTerm.length = upTo+3; + if (DEBUG_SURROGATES) { - System.out.println(" found high surr 0x" + Integer.toHexString(ch) + " at pos=" + i); + System.out.println(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString() + " len=" + scratchTerm.length); } - // the next() that we just did read in a new - // suffix, containing a surrogate pair + // Seek "forward": + // TODO: more efficient seek? + getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(scratchTerm)); - // seek forward to see if there are any terms with - // this same prefix, but with characters after the - // surrogate range; if so, we must first iterate - // them, then seek back to the surrogates + scratchTerm.bytes[upTo] = scratch[0]; + scratchTerm.bytes[upTo+1] = scratch[1]; + scratchTerm.bytes[upTo+2] = scratch[2]; + scratchTerm.length = savLength; - char[] testPrefix = new char[i+2]; - for(int j=0;j= upTo+3 && isHighBMPChar(b2.bytes, upTo)) { + matches = true; + for(int i=0;i BMP + upTo += 3; + + // NOTE: we keep iterating, now, since this + // can easily "recurse". Ie, after seeking + // forward at a certain char position, we may + // find another surrogate in our [new] suffix + // and must then do another seek (recurse) } else { - // there are no terms after the surrogates, so - // we do nothing to the enum and just step - // through the surrogates like normal. but we - // must keep iterating through the term, in case - // another surrogate pair appears later + upTo++; } + } else { + upTo++; } } + } - return false; + private boolean unicodeSortOrder; + + void reset(FieldInfo fieldInfo) throws IOException { + //System.out.println("pff.reset te=" + termEnum); + this.fieldInfo = fieldInfo; + protoTerm = new Term(fieldInfo.name); + if (termEnum == null) { + termEnum = getTermsDict().terms(protoTerm); + seekTermEnum = getTermsDict().terms(protoTerm); + //System.out.println(" term=" + termEnum.term()); + } else { + getTermsDict().seekEnum(termEnum, protoTerm); + } + skipNext = true; + + unicodeSortOrder = sortTermsByUnicode(); + + final Term t = termEnum.term(); + if (t != null && t.field() == fieldInfo.name) { + newSuffixStart = 0; + prevTerm.length = 0; + surrogateDance(); + } } @Override public Comparator getComparator() { // Pre-flex indexes always sorted in UTF16 order, but // we remap on-the-fly to unicode order - return BytesRef.getUTF8SortedAsUnicodeComparator(); + if (unicodeSortOrder) { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } else { + return BytesRef.getUTF8SortedAsUTF16Comparator(); + } } @Override @@ -484,7 +740,7 @@ @Override public SeekStatus seek(BytesRef term, boolean useCache) throws IOException { if (DEBUG_SURROGATES) { - System.out.println("TE.seek() term=" + term.utf8ToString()); + System.out.println("TE.seek target=" + UnicodeUtil.toHexString(term.utf8ToString())); } skipNext = false; final TermInfosReader tis = getTermsDict(); @@ -492,50 +748,142 @@ assert termEnum != null; - if (termEnum == null) { - termEnum = tis.terms(t0); - } else { - tis.seekEnum(termEnum, t0); - } + tis.seekEnum(termEnum, t0); - surrogateSeekUpto = 0; - surrogatesDance(); - final Term t = termEnum.term(); - final BytesRef tr = t == null ? null : t.bytes(); - - if (t != null && t.field() == fieldInfo.name && term.bytesEquals(tr)) { - current = tr; + if (t != null && t.field() == fieldInfo.name && term.bytesEquals(t.bytes())) { + // If we found an exact match, no need to do the + // surrogate dance + if (DEBUG_SURROGATES) { + System.out.println(" seek exact match"); + } + current = t.bytes(); return SeekStatus.FOUND; } else if (t == null || t.field() != fieldInfo.name) { + + // TODO: maybe we can handle this like the next() + // into null? set term as prevTerm then dance? + + if (DEBUG_SURROGATES) { + System.out.println(" seek hit EOF"); + } + + // We hit EOF; try end-case surrogate dance: if we + // find an E, try swapping in S, backwards: + scratchTerm.copy(term); + + assert scratchTerm.offset == 0; + + for(int i=scratchTerm.length-1;i>=0;i--) { + if (isHighBMPChar(scratchTerm.bytes, i)) { + if (DEBUG_SURROGATES) { + System.out.println(" found E pos=" + i + "; try seek"); + } + + if (seekToNonBMP(seekTermEnum, scratchTerm, i)) { + + scratchTerm.copy(seekTermEnum.term().bytes()); + getTermsDict().seekEnum(termEnum, seekTermEnum.term()); + + newSuffixStart = 1+i; + + doPushes(); + + // Found a match + // TODO: faster seek? + current = termEnum.term().bytes(); + return SeekStatus.NOT_FOUND; + } + } + } + + if (DEBUG_SURROGATES) { + System.out.println(" seek END"); + } + current = null; return SeekStatus.END; } else { - current = tr; - return SeekStatus.NOT_FOUND; + + // We found a non-exact but non-null term; this one + // is fun -- just treat it like next, by pretending + // requested term was prev: + prevTerm.copy(term); + + if (DEBUG_SURROGATES) { + System.out.println(" seek hit non-exact term=" + UnicodeUtil.toHexString(t.text())); + } + + final BytesRef br = t.bytes(); + assert br.offset == 0; + + setNewSuffixStart(term, br); + + surrogateDance(); + + final Term t2 = termEnum.term(); + if (t2 == null || t2.field() != fieldInfo.name) { + assert t2 == null || !t2.field().equals(fieldInfo.name); // make sure fields are in fact interned + current = null; + return SeekStatus.END; + } else { + current = t2.bytes(); + assert !unicodeSortOrder || term.compareTo(current) < 0 : "term=" + UnicodeUtil.toHexString(term.utf8ToString()) + " vs current=" + UnicodeUtil.toHexString(current.utf8ToString()); + return SeekStatus.NOT_FOUND; + } } } + private void setNewSuffixStart(BytesRef br1, BytesRef br2) { + final int limit = Math.min(br1.length, br2.length); + int lastStart = 0; + for(int i=0;i getAllExtensions() { return knownExtensions; @@ -111,8 +125,5 @@ @Override public Codec getWriter(SegmentWriteState state) { return lookup(CodecProvider.getDefaultCodec()); - //return lookup("Pulsing"); - //return lookup("Sep"); - //return lookup("IntBlock"); } -} \ No newline at end of file +} Index: lucene/src/java/org/apache/lucene/util/BytesRef.java =================================================================== --- lucene/src/java/org/apache/lucene/util/BytesRef.java (revision 979430) +++ lucene/src/java/org/apache/lucene/util/BytesRef.java (working copy) @@ -331,12 +331,17 @@ // We know the terms are not equal, but, we may // have to carefully fixup the bytes at the // difference to match UTF16's sort order: + + // NOTE: instead of moving supplementary code points (0xee and 0xef) to the unused 0xfe and 0xff, + // we move them to the unused 0xfc and 0xfd [reserved for future 6-byte character sequences] + // this reserves 0xff for preflex's term reordering (surrogate dance), and if unicode grows such + // that 6-byte sequences are needed we have much bigger problems anyway. if (aByte >= 0xee && bByte >= 0xee) { if ((aByte & 0xfe) == 0xee) { - aByte += 0x10; + aByte += 0xe; } if ((bByte&0xfe) == 0xee) { - bByte += 0x10; + bByte += 0xe; } } return aByte - bByte; @@ -346,10 +351,6 @@ // One is a prefix of the other, or, they are equal: return a.length - b.length; } - - public boolean equals(Object other) { - return this == other; - } } public void writeExternal(ObjectOutput out) Property changes on: lucene\build.xml ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/preflexfixes/lucene/build.xml:r967125-979432 Property changes on: lucene\contrib ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/preflexfixes/lucene/contrib:r967125-979432 Property changes on: lucene\contrib\CHANGES.txt ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/preflexfixes/lucene/contrib/CHANGES.txt:r967125-979432 Property changes on: lucene\contrib\instantiated\src\test\org\apache\lucene\store\instantiated\TestIndicesEquals.java ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/preflexfixes/lucene/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java:r967125-979432 Property changes on: lucene\contrib\highlighter\src\test ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/preflexfixes/lucene/contrib/highlighter/src/test:r967125-979432 Index: lucene/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java =================================================================== --- lucene/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java (revision 979430) +++ lucene/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java (working copy) @@ -33,6 +33,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.TopDocs; @@ -107,8 +108,8 @@ RAMDirectory ramdir = new RAMDirectory(); Analyzer analyzer = randomAnalyzer(); - IndexWriter writer = new IndexWriter(ramdir, analyzer, - IndexWriter.MaxFieldLength.UNLIMITED); + IndexWriter writer = new IndexWriter(ramdir, + new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); Document doc = new Document(); Field field1 = new Field("foo", fooField.toString(), Field.Store.NO, Field.Index.ANALYZED); Field field2 = new Field("term", termField.toString(), Field.Store.NO, Field.Index.ANALYZED); Index: lucene/contrib/queries/src/test/org/apache/lucene/search/similar/TestMoreLikeThis.java =================================================================== --- lucene/contrib/queries/src/test/org/apache/lucene/search/similar/TestMoreLikeThis.java (revision 979430) +++ lucene/contrib/queries/src/test/org/apache/lucene/search/similar/TestMoreLikeThis.java (working copy) @@ -28,7 +28,6 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; @@ -46,8 +45,7 @@ protected void setUp() throws Exception { super.setUp(); directory = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory); // Add series of docs with specific information for MoreLikeThis addDoc(writer, "lucene"); Index: lucene/contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java =================================================================== --- lucene/contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java (revision 979430) +++ lucene/contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java (working copy) @@ -20,11 +20,9 @@ import java.io.IOException; import java.util.HashSet; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.index.DocsEnum; @@ -44,8 +42,7 @@ protected void setUp() throws Exception { super.setUp(); directory = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory); //Add series of docs with filterable fields : url, text and dates flags addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101"); Index: lucene/contrib/queries/src/test/org/apache/lucene/search/TermsFilterTest.java =================================================================== --- lucene/contrib/queries/src/test/org/apache/lucene/search/TermsFilterTest.java (revision 979430) +++ lucene/contrib/queries/src/test/org/apache/lucene/search/TermsFilterTest.java (working copy) @@ -19,11 +19,9 @@ import java.util.HashSet; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.RAMDirectory; @@ -53,8 +51,7 @@ public void testMissingTerms() throws Exception { String fieldName="field1"; RAMDirectory rd=new RAMDirectory(); - RandomIndexWriter w = new RandomIndexWriter(newRandom(), rd, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter w = new RandomIndexWriter(newRandom(), rd); for (int i = 0; i < 100; i++) { Document doc=new Document(); int term=i*10; //terms are units of 10; Index: lucene/contrib/queries/src/test/org/apache/lucene/search/ChainedFilterTest.java =================================================================== --- lucene/contrib/queries/src/test/org/apache/lucene/search/ChainedFilterTest.java (revision 979430) +++ lucene/contrib/queries/src/test/org/apache/lucene/search/ChainedFilterTest.java (working copy) @@ -21,11 +21,9 @@ import java.util.GregorianCalendar; import java.util.Random; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; @@ -63,9 +61,7 @@ super.setUp(); random = newRandom(); directory = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random, directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); - + RandomIndexWriter writer = new RandomIndexWriter(random, directory); Calendar cal = new GregorianCalendar(); cal.clear(); cal.setTimeInMillis(1041397200000L); // 2003 January 01 @@ -200,8 +196,7 @@ public void testWithCachingFilter() throws Exception { Directory dir = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random, dir, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(random, dir); IndexReader reader = writer.getReader(); writer.close(); Index: lucene/contrib/queries/src/test/org/apache/lucene/search/BooleanFilterTest.java =================================================================== --- lucene/contrib/queries/src/test/org/apache/lucene/search/BooleanFilterTest.java (revision 979430) +++ lucene/contrib/queries/src/test/org/apache/lucene/search/BooleanFilterTest.java (working copy) @@ -18,13 +18,13 @@ */ import java.io.IOException; +import java.util.Random; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.RAMDirectory; @@ -38,8 +38,7 @@ protected void setUp() throws Exception { super.setUp(); directory = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, new IndexWriterConfig( - TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.WHITESPACE, false))); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, new MockAnalyzer(MockTokenizer.WHITESPACE, false)); //Add series of docs with filterable fields : acces rights, prices, dates and "in-stock" flags addDoc(writer, "admin guest", "010", "20040101","Y"); Index: lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestRegexQuery.java =================================================================== --- lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestRegexQuery.java (revision 979430) +++ lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestRegexQuery.java (working copy) @@ -20,10 +20,8 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.search.IndexSearcher; @@ -44,8 +42,7 @@ protected void setUp() throws Exception { super.setUp(); directory = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory); Document doc = new Document(); doc.add(new Field(FN, "the quick brown fox jumps over the lazy dog", Field.Store.NO, Field.Index.ANALYZED)); writer.addDocument(doc); Index: lucene/contrib/queries/src/test/org/apache/lucene/search/FuzzyLikeThisQueryTest.java =================================================================== --- lucene/contrib/queries/src/test/org/apache/lucene/search/FuzzyLikeThisQueryTest.java (revision 979430) +++ lucene/contrib/queries/src/test/org/apache/lucene/search/FuzzyLikeThisQueryTest.java (working copy) @@ -25,7 +25,6 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.RAMDirectory; @@ -41,8 +40,7 @@ protected void setUp() throws Exception { super.setUp(); directory = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer)); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory); //Add series of docs with misspelt names addDoc(writer, "jonathon smythe","1");