Property changes on: .
___________________________________________________________________
Modified: svn:mergeinfo
Merged /lucene/dev/branches/preflexfixes:r967125-979432
Property changes on: solr
___________________________________________________________________
Modified: svn:mergeinfo
Merged /lucene/dev/branches/preflexfixes/solr:r967125-979432
Index: solr/common-build.xml
===================================================================
--- solr/common-build.xml (revision 979430)
+++ solr/common-build.xml (working copy)
@@ -44,6 +44,8 @@
+
+
+
+
Index: lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java (working copy)
@@ -27,7 +27,6 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.RAMDirectory;
@@ -88,8 +87,7 @@
int terms = (int) Math.pow(2, bits);
RAMDirectory dir = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(random, dir,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.KEYWORD, false)));
+ RandomIndexWriter writer = new RandomIndexWriter(random, dir, new MockAnalyzer(MockTokenizer.KEYWORD, false));
Document doc = new Document();
Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED);
Index: lucene/src/test/org/apache/lucene/search/TestNot.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestNot.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestNot.java (working copy)
@@ -20,7 +20,6 @@
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.store.RAMDirectory;
@@ -40,8 +39,7 @@
public void testNot() throws Exception {
RAMDirectory store = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), store,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), store);
Document d1 = new Document();
d1.add(new Field("field", "a b", Field.Store.YES, Field.Index.ANALYZED));
Index: lucene/src/test/org/apache/lucene/search/TestTimeLimitingCollector.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestTimeLimitingCollector.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestTimeLimitingCollector.java (working copy)
@@ -24,7 +24,6 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.TimeLimitingCollector.TimeExceededException;
@@ -78,8 +77,7 @@
"blueberry pizza",
};
directory = new RAMDirectory();
- RandomIndexWriter iw = new RandomIndexWriter(newRandom(), directory,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter iw = new RandomIndexWriter(newRandom(), directory);
for (int i=0; i> docs = new ArrayList>();
Document d = new Document();
Field f = new Field("f", "", Field.Store.NO, Field.Index.ANALYZED);
Index: lucene/src/test/org/apache/lucene/search/TestPositionIncrement.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestPositionIncrement.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestPositionIncrement.java (working copy)
@@ -35,7 +35,6 @@
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
@@ -91,8 +90,7 @@
}
};
Directory store = new MockRAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), store,
- new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), store, analyzer);
Document d = new Document();
d.add(new Field("field", "bogus", Field.Store.YES, Field.Index.ANALYZED));
writer.addDocument(d);
@@ -242,8 +240,7 @@
public void testPayloadsPos0() throws Exception {
Directory dir = new MockRAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new TestPayloadAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir, new TestPayloadAnalyzer());
Document doc = new Document();
doc.add(new Field("content", new StringReader(
"a a b c d e a f g h i j a b k k")));
Index: lucene/src/test/org/apache/lucene/search/TestBooleanMinShouldMatch.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestBooleanMinShouldMatch.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestBooleanMinShouldMatch.java (working copy)
@@ -20,11 +20,9 @@
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
-import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
@@ -60,8 +58,7 @@
};
index = new RAMDirectory();
- RandomIndexWriter w = new RandomIndexWriter(rnd, index, new IndexWriterConfig(
- TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter w = new RandomIndexWriter(rnd, index);
for (int i = 0; i < data.length; i++) {
Document doc = new Document();
Index: lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java (working copy)
@@ -25,8 +25,6 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
@@ -128,8 +126,7 @@
query.setSlop(slop);
RAMDirectory ramDir = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(random, ramDir,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.WHITESPACE, false)));
+ RandomIndexWriter writer = new RandomIndexWriter(random, ramDir, new MockAnalyzer(MockTokenizer.WHITESPACE, false));
writer.addDocument(doc);
IndexReader reader = writer.getReader();
Index: lucene/src/test/org/apache/lucene/search/TestBooleanOr.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestBooleanOr.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestBooleanOr.java (working copy)
@@ -20,11 +20,9 @@
import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
@@ -143,8 +141,7 @@
Random random = newRandom();
//
- RandomIndexWriter writer = new RandomIndexWriter(random, dir, new IndexWriterConfig(
- TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(random, dir);
//
Document d = new Document();
Index: lucene/src/test/org/apache/lucene/search/TestDateSort.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestDateSort.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestDateSort.java (working copy)
@@ -26,7 +26,6 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
@@ -53,8 +52,7 @@
super.setUp();
// Create an index writer.
directory = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory);
// oldest doc:
// Add the first document. text = "Document 1" dateTime = Oct 10 03:25:22 EDT 2007
Index: lucene/src/test/org/apache/lucene/search/TestSort.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestSort.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestSort.java (working copy)
@@ -112,8 +112,7 @@
private Searcher getIndex (boolean even, boolean odd)
throws IOException {
RAMDirectory indexStore = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(random, indexStore,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(random, indexStore);
for (int i=0; i it's like having a big hairy scary monster in the basement but being upset that it doesn't have fangs
- RandomIndexWriter writer = new RandomIndexWriter(random, dir, new IndexWriterConfig(TEST_VERSION_CURRENT,
- new MockAnalyzer(MockTokenizer.KEYWORD, false)));
+ RandomIndexWriter writer = new RandomIndexWriter(random, dir, new MockAnalyzer(MockTokenizer.KEYWORD, false));
Document doc = new Document();
Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED);
doc.add(field);
-
+ List terms = new ArrayList();
for (int i = 0; i < 2000*_TestUtil.getRandomMultiplier(); i++) {
- field.setValue(_TestUtil.randomUnicodeString(random));
+ String s = _TestUtil.randomUnicodeString(random);
+ field.setValue(s);
+ terms.add(s);
writer.addDocument(doc);
}
+
+ if (VERBOSE) {
+ // utf16 order
+ Collections.sort(terms);
+ System.out.println("UTF16 order:");
+ for(String s : terms) {
+ System.out.println(" " + UnicodeUtil.toHexString(s));
+ }
+ }
+
reader = writer.getReader();
searcher = new IndexSearcher(reader);
writer.close();
@@ -122,8 +135,11 @@
/** test a bunch of random regular expressions */
public void testRegexps() throws Exception {
- for (int i = 0; i < 1000*_TestUtil.getRandomMultiplier(); i++)
- assertSame(AutomatonTestUtil.randomRegexp(random).toString());
+
+ for (int i = 0; i < 1000*_TestUtil.getRandomMultiplier(); i++) {
+ String reg = AutomatonTestUtil.randomRegexp(random).toString();
+ assertSame(reg);
+ }
}
/** check that the # of hits is the same as from a very
Index: lucene/src/test/org/apache/lucene/search/TestRegexpQuery.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestRegexpQuery.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestRegexpQuery.java (working copy)
@@ -20,11 +20,9 @@
import java.io.IOException;
import java.util.Arrays;
-import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
@@ -48,8 +46,7 @@
public void setUp() throws Exception {
super.setUp();
directory = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory);
Document doc = new Document();
doc.add(new Field(FN,
"the quick brown fox jumps over the lazy ??? dog 493432 49344",
Index: lucene/src/test/org/apache/lucene/search/TestDateFilter.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestDateFilter.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestDateFilter.java (working copy)
@@ -18,12 +18,10 @@
*/
import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.RAMDirectory;
@@ -47,8 +45,7 @@
public void testBefore() throws IOException {
// create an index
RAMDirectory indexStore = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore);
long now = System.currentTimeMillis();
@@ -114,8 +111,7 @@
public void testAfter() throws IOException {
// create an index
RAMDirectory indexStore = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore);
long now = System.currentTimeMillis();
Index: lucene/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java (working copy)
@@ -22,13 +22,13 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import java.text.DecimalFormat;
+import java.util.Random;
import java.io.IOException;
/**
@@ -80,8 +80,9 @@
super.setUp();
index = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), index,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())
+ Random random = newRandom();
+ RandomIndexWriter writer = new RandomIndexWriter(random, index,
+ newIndexWriterConfig(random, TEST_VERSION_CURRENT, new MockAnalyzer())
.setSimilarity(sim));
// hed is the most important field, dek is secondary
Index: lucene/src/test/org/apache/lucene/search/TestSimilarity.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestSimilarity.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestSimilarity.java (working copy)
@@ -20,6 +20,7 @@
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
import java.util.Collection;
+import java.util.Random;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
@@ -65,8 +66,9 @@
public void testSimilarity() throws Exception {
RAMDirectory store = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), store,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())
+ Random random = newRandom();
+ RandomIndexWriter writer = new RandomIndexWriter(random, store,
+ newIndexWriterConfig(random, TEST_VERSION_CURRENT, new MockAnalyzer())
.setSimilarity(new SimpleSimilarity()));
Document d1 = new Document();
Index: lucene/src/test/org/apache/lucene/search/TestTopScoreDocCollector.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestTopScoreDocCollector.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestTopScoreDocCollector.java (working copy)
@@ -19,10 +19,8 @@
import java.util.Random;
-import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory;
@@ -42,8 +40,7 @@
Directory dir = new RAMDirectory();
Random random = newRandom();
- RandomIndexWriter writer = new RandomIndexWriter(random, dir,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(random, dir);
for (int i = 0; i < 10; i++) {
writer.addDocument(new Document());
}
Index: lucene/src/test/org/apache/lucene/search/TestRegexpRandom.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestRegexpRandom.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestRegexpRandom.java (working copy)
@@ -51,8 +51,7 @@
super.setUp();
random = newRandom();
dir = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(random, dir,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(random, dir);
Document doc = new Document();
Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED);
Index: lucene/src/test/org/apache/lucene/search/TestSpanQueryFilter.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestSpanQueryFilter.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestSpanQueryFilter.java (working copy)
@@ -18,11 +18,9 @@
import java.util.List;
-import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanTermQuery;
@@ -40,8 +38,7 @@
public void testFilterWorks() throws Exception {
Directory dir = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir);
for (int i = 0; i < 500; i++) {
Document document = new Document();
document.add(new Field("field", English.intToEnglish(i) + " equals " + English.intToEnglish(i),
Index: lucene/src/test/org/apache/lucene/search/TestPrefixRandom.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestPrefixRandom.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestPrefixRandom.java (working copy)
@@ -25,7 +25,6 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.RandomIndexWriter;
@@ -51,8 +50,7 @@
random = newRandom();
dir = new MockRAMDirectory();
// TODO: fix mocktokenizer to not extend chartokenizer, so you can have an 'empty' keyword.
- RandomIndexWriter writer = new RandomIndexWriter(random, dir,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.KEYWORD, false)));
+ RandomIndexWriter writer = new RandomIndexWriter(random, dir, new MockAnalyzer(MockTokenizer.KEYWORD, false));
Document doc = new Document();
Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED);
Index: lucene/src/test/org/apache/lucene/search/TestCustomSearcherSort.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestCustomSearcherSort.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestCustomSearcherSort.java (working copy)
@@ -24,12 +24,10 @@
import java.util.Random;
import java.util.TreeMap;
-import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
@@ -59,8 +57,7 @@
super.setUp();
Random rand = newRandom();
index = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(rand, index,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(rand, index);
RandomGen random = new RandomGen(rand);
for (int i = 0; i < INDEX_SIZE; ++i) { // don't decrease; if to low the
// problem doesn't show up
Index: lucene/src/test/org/apache/lucene/search/TestPrefixInBooleanQuery.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestPrefixInBooleanQuery.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestPrefixInBooleanQuery.java (working copy)
@@ -18,11 +18,9 @@
*/
import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.RAMDirectory;
@@ -48,8 +46,7 @@
protected void setUp() throws Exception {
super.setUp();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory);
for (int i = 0; i < 5137; ++i) {
Document doc = new Document();
Index: lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery32.java (working copy)
@@ -55,8 +55,7 @@
public static void beforeClass() throws Exception {
directory = new RAMDirectory();
Random random = newStaticRandom(TestNumericRangeQuery32.class);
- RandomIndexWriter writer = new RandomIndexWriter(random, directory,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(random, directory);
NumericField
field8 = new NumericField("field8", 8, Field.Store.YES, true),
Index: lucene/src/test/org/apache/lucene/search/TestDocBoost.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestDocBoost.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestDocBoost.java (working copy)
@@ -20,10 +20,8 @@
import java.io.IOException;
import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.RAMDirectory;
@@ -40,8 +38,7 @@
public void testDocBoost() throws Exception {
RAMDirectory store = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), store,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), store);
Fieldable f1 = new Field("field", "word", Field.Store.YES, Field.Index.ANALYZED);
Fieldable f2 = new Field("field", "word", Field.Store.YES, Field.Index.ANALYZED);
Index: lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java (working copy)
@@ -54,8 +54,7 @@
public static void beforeClass() throws Exception {
directory = new RAMDirectory();
Random random = newStaticRandom(TestNumericRangeQuery64.class);
- RandomIndexWriter writer = new RandomIndexWriter(random, directory,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(random, directory);
NumericField
field8 = new NumericField("field8", 8, Field.Store.YES, true),
Index: lucene/src/test/org/apache/lucene/search/TestPrefixQuery.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestPrefixQuery.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestPrefixQuery.java (working copy)
@@ -20,10 +20,8 @@
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
-import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -38,8 +36,7 @@
String[] categories = new String[] {"/Computers",
"/Computers/Mac",
"/Computers/Windows"};
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory);
for (int i = 0; i < categories.length; i++) {
Document doc = new Document();
doc.add(new Field("category", categories[i], Field.Store.YES, Field.Index.NOT_ANALYZED));
Index: lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java (working copy)
@@ -17,7 +17,6 @@
* limitations under the License.
*/
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
@@ -25,7 +24,6 @@
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.store.MockRAMDirectory;
-import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -46,8 +44,7 @@
public void testPhrasePrefix() throws IOException {
MockRAMDirectory indexStore = new MockRAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore);
add("blueberry pie", writer);
add("blueberry strudel", writer);
add("blueberry pizza", writer);
@@ -152,8 +149,7 @@
// The contained PhraseMultiQuery must contain exactly one term array.
MockRAMDirectory indexStore = new MockRAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore);
add("blueberry pie", writer);
add("blueberry chewing gum", writer);
add("blue raspberry pie", writer);
@@ -185,8 +181,7 @@
public void testPhrasePrefixWithBooleanQuery() throws IOException {
MockRAMDirectory indexStore = new MockRAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore);
add("This is a test", "object", writer);
add("a note", "note", writer);
@@ -214,8 +209,7 @@
public void testNoDocs() throws Exception {
MockRAMDirectory indexStore = new MockRAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), indexStore);
add("a note", "note", writer);
IndexReader reader = writer.getReader();
Index: lucene/src/test/org/apache/lucene/search/TestBooleanPrefixQuery.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestBooleanPrefixQuery.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestBooleanPrefixQuery.java (working copy)
@@ -22,11 +22,9 @@
import junit.framework.TestSuite;
import junit.textui.TestRunner;
import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.PrefixQuery;
@@ -77,8 +75,7 @@
Query rw1 = null;
Query rw2 = null;
IndexReader reader = null;
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, new IndexWriterConfig(
- TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory);
for (int i = 0; i < categories.length; i++) {
Document doc = new Document();
doc.add(new Field("category", categories[i], Field.Store.YES, Field.Index.NOT_ANALYZED));
Index: lucene/src/test/org/apache/lucene/search/TestFilteredQuery.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestFilteredQuery.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestFilteredQuery.java (working copy)
@@ -17,11 +17,9 @@
* limitations under the License.
*/
-import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
@@ -50,8 +48,7 @@
protected void setUp() throws Exception {
super.setUp();
directory = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter (newRandom(), directory,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter (newRandom(), directory);
Document doc = new Document();
doc.add (new Field("field", "one two three four five", Field.Store.YES, Field.Index.ANALYZED));
@@ -73,6 +70,11 @@
doc.add (new Field("sorter", "c", Field.Store.YES, Field.Index.ANALYZED));
writer.addDocument (doc);
+ // tests here require single segment (eg try seed
+ // 8239472272678419952L), because SingleDocTestFilter(x)
+ // blindly accepts that docID in any sub-segment
+ writer.optimize();
+
reader = writer.getReader();
writer.close ();
Index: lucene/src/test/org/apache/lucene/search/TestFieldCacheTermsFilter.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestFieldCacheTermsFilter.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestFieldCacheTermsFilter.java (working copy)
@@ -19,11 +19,9 @@
import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.store.MockRAMDirectory;
@@ -39,8 +37,7 @@
public void testMissingTerms() throws Exception {
String fieldName = "field1";
MockRAMDirectory rd = new MockRAMDirectory();
- RandomIndexWriter w = new RandomIndexWriter(newRandom(), rd,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter w = new RandomIndexWriter(newRandom(), rd);
for (int i = 0; i < 100; i++) {
Document doc = new Document();
int term = i * 10; //terms are units of 10;
Index: lucene/src/test/org/apache/lucene/search/TestTermRangeFilter.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestTermRangeFilter.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/search/TestTermRangeFilter.java (working copy)
@@ -22,10 +22,8 @@
import java.util.Locale;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
-import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.RAMDirectory;
@@ -401,8 +399,7 @@
/* build an index */
RAMDirectory farsiIndex = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(rand, farsiIndex,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(rand, farsiIndex);
Document doc = new Document();
doc.add(new Field("content", "\u0633\u0627\u0628", Field.Store.YES,
Field.Index.NOT_ANALYZED));
@@ -442,8 +439,7 @@
/* build an index */
RAMDirectory danishIndex = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(rand, danishIndex,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(rand, danishIndex);
// Danish collation orders the words below in the given order
// (example taken from TestSort.testInternationalSort() ).
String[] words = {"H\u00D8T", "H\u00C5T", "MAND"};
Index: lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java (working copy)
@@ -21,6 +21,7 @@
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util._TestUtil;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -64,7 +65,7 @@
public void testPrevTermAtEnd() throws IOException
{
Directory dir = new MockRAMDirectory();
- IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
addDoc(writer, "aaa bbb");
writer.close();
SegmentReader reader = SegmentReader.getOnlySegmentReader(dir);
Index: lucene/src/test/org/apache/lucene/index/TestRollback.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/TestRollback.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/index/TestRollback.java (working copy)
@@ -31,7 +31,7 @@
// LUCENE-2536
public void testRollbackIntegrityWithBufferFlush() throws Exception {
Directory dir = new MockRAMDirectory();
- RandomIndexWriter rw = new RandomIndexWriter(newRandom(), dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter rw = new RandomIndexWriter(newRandom(), dir);
for (int i = 0; i < 5; i++) {
Document doc = new Document();
doc.add(new Field("pk", Integer.toString(i), Store.YES, Index.ANALYZED_NO_NORMS));
Index: lucene/src/test/org/apache/lucene/index/TestIndexReader.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/TestIndexReader.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/index/TestIndexReader.java (working copy)
@@ -1675,7 +1675,7 @@
// LUCENE-1586: getUniqueTermCount
public void testUniqueTermCount() throws Exception {
Directory dir = new MockRAMDirectory();
- IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
Document doc = new Document();
doc.add(new Field("field", "a b c d e f g h i j k l m n o p q r s t u v w x y z", Field.Store.NO, Field.Index.ANALYZED));
doc.add(new Field("number", "0 1 2 3 4 5 6 7 8 9", Field.Store.NO, Field.Index.ANALYZED));
@@ -1708,7 +1708,7 @@
// LUCENE-1609: don't load terms index
public void testNoTermsIndex() throws Throwable {
Directory dir = new MockRAMDirectory();
- IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
Document doc = new Document();
doc.add(new Field("field", "a b c d e f g h i j k l m n o p q r s t u v w x y z", Field.Store.NO, Field.Index.ANALYZED));
doc.add(new Field("number", "0 1 2 3 4 5 6 7 8 9", Field.Store.NO, Field.Index.ANALYZED));
@@ -1725,7 +1725,7 @@
}
assertEquals(-1, ((SegmentReader) r.getSequentialSubReaders()[0]).getTermInfosIndexDivisor());
- writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
writer.addDocument(doc);
writer.close();
Index: lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java (working copy)
@@ -68,10 +68,10 @@
// TODO: verify equals using IW.getReader
DocsAndWriter dw = indexRandomIWReader(5, 3, 100, dir);
- IndexReader r = dw.writer.getReader();
+ IndexReader reader = dw.writer.getReader();
dw.writer.commit();
- verifyEquals(r, dir, "id");
- r.close();
+ verifyEquals(r, reader, dir, "id");
+ reader.close();
dw.writer.close();
dir.close();
}
@@ -261,8 +261,8 @@
w.close();
}
- public static void verifyEquals(IndexReader r1, Directory dir2, String idField) throws Throwable {
- IndexReader r2 = IndexReader.open(dir2, true);
+ public static void verifyEquals(Random r, IndexReader r1, Directory dir2, String idField) throws Throwable {
+ IndexReader r2 = IndexReader.open(dir2);
verifyEquals(r1, r2, idField);
r2.close();
}
Index: lucene/src/test/org/apache/lucene/index/TestFlex.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/TestFlex.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/index/TestFlex.java (working copy)
@@ -20,6 +20,8 @@
import java.io.*;
import java.util.*;
import org.apache.lucene.store.*;
+import org.apache.lucene.index.codecs.*;
+import org.apache.lucene.index.codecs.standard.*;
import org.apache.lucene.search.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.document.*;
@@ -64,7 +66,8 @@
public void testTermOrd() throws Exception {
Directory d = new MockRAMDirectory();
- IndexWriter w = new IndexWriter(d, new MockAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
+ IndexWriter w = new IndexWriter(d, new IndexWriterConfig(TEST_VERSION_CURRENT,
+ new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
Document doc = new Document();
doc.add(new Field("f", "a b c", Field.Store.NO, Field.Index.ANALYZED));
w.addDocument(doc);
Index: lucene/src/test/org/apache/lucene/index/TestIndexWriter.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (working copy)
@@ -4559,7 +4559,7 @@
dir.close();
}
- // LUCENE-2095: make sure with multiple threads commit
+ // LUCENE-2095: make sure with multiple threads commit
// doesn't return until all changes are in fact in the
// index
public void testCommitThreadSafety() throws Throwable {
@@ -4670,19 +4670,19 @@
}
// Make sure terms, including ones with surrogate pairs,
- // sort in UTF16 sort order by default
+ // sort in codepoint sort order by default
public void testTermUTF16SortOrder() throws Throwable {
+ Random rnd = newRandom();
Directory dir = new MockRAMDirectory();
- IndexWriter writer = new IndexWriter(dir, new MockAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
+ RandomIndexWriter writer = new RandomIndexWriter(rnd, dir);
Document d = new Document();
// Single segment
Field f = new Field("f", "", Field.Store.NO, Field.Index.NOT_ANALYZED);
d.add(f);
char[] chars = new char[2];
- Random rnd = newRandom();
final Set<String> allTerms = new HashSet<String>();
- for(int i=0;i<200*_TestUtil.getRandomMultiplier();i++) {
+ for(int i=0;i<10*_TestUtil.getRandomMultiplier();i++) {
final String s;
if (rnd.nextBoolean()) {
@@ -4705,14 +4705,13 @@
allTerms.add(s);
f.setValue(s);
- //System.out.println("add " + termDesc(s));
writer.addDocument(d);
if ((1+i) % 42 == 0) {
writer.commit();
}
}
-
+
IndexReader r = writer.getReader();
// Test each sub-segment
Index: lucene/src/test/org/apache/lucene/index/RandomIndexWriter.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/RandomIndexWriter.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/index/RandomIndexWriter.java (working copy)
@@ -17,20 +17,18 @@
* limitations under the License.
*/
-import java.util.Random;
import java.io.Closeable;
import java.io.IOException;
+import java.util.Random;
-import org.apache.lucene.util._TestUtil;
-import org.apache.lucene.store.Directory;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
-import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.index.codecs.CodecProvider;
-import org.apache.lucene.index.codecs.intblock.IntBlockCodec;
-import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
-import org.apache.lucene.index.codecs.pulsing.PulsingCodec;
-import org.apache.lucene.index.codecs.sep.SepCodec;
-import org.apache.lucene.index.codecs.standard.StandardCodec;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCaseJ4;
+import org.apache.lucene.util.Version;
+import org.apache.lucene.util._TestUtil;
/** Silly class that randomizes the indexing experience. EG
* it may swap in a different merge policy/scheduler; may
@@ -45,32 +43,48 @@
int docCount;
int flushAt;
+ // Randomly calls Thread.yield so we mixup thread scheduling
+ private static final class MockIndexWriter extends IndexWriter {
+
+ private final Random r;
+
+ public MockIndexWriter(Random r,Directory dir, IndexWriterConfig conf) throws IOException {
+ super(dir, conf);
+ this.r = r;
+ }
+
+ @Override
+ boolean testPoint(String name) {
+ if (r.nextInt(4) == 2)
+ Thread.yield();
+ return true;
+ }
+ }
+
+ /** create a RandomIndexWriter with a random config: Uses TEST_VERSION_CURRENT and MockAnalyzer */
+ public RandomIndexWriter(Random r, Directory dir) throws IOException {
+ this(r, dir, LuceneTestCaseJ4.newIndexWriterConfig(r, LuceneTestCaseJ4.TEST_VERSION_CURRENT, new MockAnalyzer()));
+ }
+
+ /** create a RandomIndexWriter with a random config: Uses TEST_VERSION_CURRENT */
+ public RandomIndexWriter(Random r, Directory dir, Analyzer a) throws IOException {
+ this(r, dir, LuceneTestCaseJ4.newIndexWriterConfig(r, LuceneTestCaseJ4.TEST_VERSION_CURRENT, a));
+ }
+
+ /** create a RandomIndexWriter with a random config */
+ public RandomIndexWriter(Random r, Directory dir, Version v, Analyzer a) throws IOException {
+ this(r, dir, LuceneTestCaseJ4.newIndexWriterConfig(r, v, a));
+ }
+
+ /** create a RandomIndexWriter with the provided config */
public RandomIndexWriter(Random r, Directory dir, IndexWriterConfig c) throws IOException {
this.r = r;
- if (r.nextBoolean()) {
- c.setMergePolicy(new LogDocMergePolicy());
+ w = new MockIndexWriter(r, dir, c);
+ flushAt = _TestUtil.nextInt(r, 10, 1000);
+ if (LuceneTestCaseJ4.VERBOSE) {
+ System.out.println("RIW config=" + w.getConfig());
+ System.out.println("codec default=" + CodecProvider.getDefaultCodec());
}
- if (r.nextBoolean()) {
- c.setMergeScheduler(new SerialMergeScheduler());
- }
- if (r.nextBoolean()) {
- c.setMaxBufferedDocs(_TestUtil.nextInt(r, 2, 1000));
- }
- if (r.nextBoolean()) {
- c.setTermIndexInterval(_TestUtil.nextInt(r, 1, 1000));
- }
-
- if (c.getMergePolicy() instanceof LogMergePolicy) {
- LogMergePolicy logmp = (LogMergePolicy) c.getMergePolicy();
- logmp.setUseCompoundDocStore(r.nextBoolean());
- logmp.setUseCompoundFile(r.nextBoolean());
- logmp.setCalibrateSizeByDeletes(r.nextBoolean());
- }
-
- c.setReaderPooling(r.nextBoolean());
- c.setCodecProvider(new RandomCodecProvider(r));
- w = new IndexWriter(dir, c);
- flushAt = _TestUtil.nextInt(r, 10, 1000);
}
public void addDocument(Document doc) throws IOException {
@@ -89,14 +103,27 @@
w.deleteDocuments(term);
}
+ public void commit() throws CorruptIndexException, IOException {
+ w.commit();
+ }
+
public int maxDoc() {
return w.maxDoc();
}
public IndexReader getReader() throws IOException {
- if (r.nextBoolean()) {
+ // If we are writing with PreFlexRW, force a full
+ // IndexReader.open so terms are sorted in codepoint
+ // order during searching:
+ if (!w.codecs.getWriter(null).name.equals("PreFlex") && r.nextBoolean()) {
+ if (LuceneTestCaseJ4.VERBOSE) {
+ System.out.println("RIW.getReader: use NRT reader");
+ }
return w.getReader();
} else {
+ if (LuceneTestCaseJ4.VERBOSE) {
+ System.out.println("RIW.getReader: open new reader");
+ }
w.commit();
return IndexReader.open(w.getDirectory(), new KeepOnlyLastCommitDeletionPolicy(), r.nextBoolean(), _TestUtil.nextInt(r, 1, 10));
}
@@ -112,22 +139,4 @@
public void optimize() throws IOException {
w.optimize();
}
-
- class RandomCodecProvider extends CodecProvider {
- final String codec;
-
- RandomCodecProvider(Random random) {
- register(new StandardCodec());
- register(new IntBlockCodec());
- register(new PreFlexCodec());
- register(new PulsingCodec());
- register(new SepCodec());
- codec = CodecProvider.CORE_CODECS[random.nextInt(CodecProvider.CORE_CODECS.length)];
- }
-
- @Override
- public Codec getWriter(SegmentWriteState state) {
- return lookup(codec);
- }
- }
}
Index: lucene/src/test/org/apache/lucene/index/TestMultiFields.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/TestMultiFields.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/index/TestMultiFields.java (working copy)
@@ -27,12 +27,13 @@
public void testRandom() throws Exception {
+ Random r = newRandom();
+
for(int iter=0;iter<2*_TestUtil.getRandomMultiplier();iter++) {
Directory dir = new MockRAMDirectory();
+
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setMergePolicy(NoMergePolicy.COMPOUND_FILES));
- Random r = new Random();
-
Map<BytesRef,List<Integer>> docs = new HashMap<BytesRef,List<Integer>>();
Set<Integer> deleted = new HashSet<Integer>();
List<BytesRef> terms = new ArrayList<BytesRef>();
@@ -45,7 +46,7 @@
doc.add(id);
boolean onlyUniqueTerms = r.nextBoolean();
-
+ Set<BytesRef> uniqueTerms = new HashSet<BytesRef>();
for(int i=0;i 0) {
@@ -61,6 +62,7 @@
}
docs.get(term).add(i);
terms.add(term);
+ uniqueTerms.add(term);
f.setValue(s);
}
id.setValue(""+i);
@@ -75,8 +77,18 @@
}
}
+ if (VERBOSE) {
+ List<BytesRef> termsList = new ArrayList<BytesRef>(uniqueTerms);
+ Collections.sort(termsList, BytesRef.getUTF8SortedAsUTF16Comparator());
+ System.out.println("UTF16 order:");
+ for(BytesRef b : termsList) {
+ System.out.println(" " + UnicodeUtil.toHexString(b.utf8ToString()));
+ }
+ }
+
IndexReader reader = w.getReader();
w.close();
+ //System.out.println("TEST reader=" + reader);
Bits delDocs = MultiFields.getDeletedDocs(reader);
for(int delDoc : deleted) {
Index: lucene/src/test/org/apache/lucene/index/TestCodecs.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/TestCodecs.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/index/TestCodecs.java (working copy)
@@ -493,15 +493,22 @@
// Test random seek by ord:
final int idx = TestCodecs.this.nextInt(field.terms.length);
term = field.terms[idx];
- status = termsEnum.seek(idx);
- assertEquals(status, TermsEnum.SeekStatus.FOUND);
- assertTrue(termsEnum.term().bytesEquals(new BytesRef(term.text2)));
- assertEquals(term.docs.length, termsEnum.docFreq());
- if (field.omitTF) {
- this.verifyDocs(term.docs, term.positions, termsEnum.docs(null, null), false);
- } else {
- this.verifyDocs(term.docs, term.positions, termsEnum.docsAndPositions(null, null), true);
+ try {
+ status = termsEnum.seek(idx);
+ } catch (UnsupportedOperationException uoe) {
+ // ok -- skip it
+ status = null;
}
+ if (status != null) {
+ assertEquals(status, TermsEnum.SeekStatus.FOUND);
+ assertTrue(termsEnum.term().bytesEquals(new BytesRef(term.text2)));
+ assertEquals(term.docs.length, termsEnum.docFreq());
+ if (field.omitTF) {
+ this.verifyDocs(term.docs, term.positions, termsEnum.docs(null, null), false);
+ } else {
+ this.verifyDocs(term.docs, term.positions, termsEnum.docsAndPositions(null, null), true);
+ }
+ }
// Test seek to non-existent terms:
for(int i=0;i<100;i++) {
@@ -520,9 +527,12 @@
// Seek to each term by ord, backwards
for(int i=field.terms.length-1;i>=0;i--) {
- assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seek(i));
- assertEquals(field.terms[i].docs.length, termsEnum.docFreq());
- assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[i].text2)));
+ try {
+ assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seek(i));
+ assertEquals(field.terms[i].docs.length, termsEnum.docFreq());
+ assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[i].text2)));
+ } catch (UnsupportedOperationException uoe) {
+ }
}
// Seek to non-existent empty-string term
Index: lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java (working copy)
@@ -394,18 +394,18 @@
}
public void testDeletesOnDiskFull() throws IOException {
- testOperationsOnDiskFull(false);
+ doTestOperationsOnDiskFull(false);
}
public void testUpdatesOnDiskFull() throws IOException {
- testOperationsOnDiskFull(true);
+ doTestOperationsOnDiskFull(true);
}
/**
* Make sure if modifier tries to commit but hits disk full that modifier
* remains consistent and usable. Similar to TestIndexReader.testDiskFull().
*/
- private void testOperationsOnDiskFull(boolean updates) throws IOException {
+ private void doTestOperationsOnDiskFull(boolean updates) throws IOException {
Term searchTerm = new Term("content", "aaa");
int START_COUNT = 157;
@@ -700,6 +700,7 @@
try {
modifier.commit();
} catch (IOException ioe) {
+ // expected
failed = true;
}
Index: lucene/src/test/org/apache/lucene/index/TestAddIndexes.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/TestAddIndexes.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/index/TestAddIndexes.java (working copy)
@@ -19,7 +19,6 @@
import java.io.IOException;
-import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
@@ -139,7 +138,6 @@
setUpDirs(dir, aux);
IndexWriter writer = newWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setOpenMode(OpenMode.APPEND));
-
writer.addIndexes(new Directory[] {aux});
// Adds 10 docs, then replaces them with another 10
Index: lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java (working copy)
@@ -1,225 +0,0 @@
-package org.apache.lucene.index.codecs.preflex;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-import java.io.IOException;
-import org.apache.lucene.store.*;
-import org.apache.lucene.index.*;
-import org.apache.lucene.util.*;
-
-
-/** This stores a monotonically increasing set of pairs in a
- Directory. A TermInfos can be written once, in order. */
-
-final class TermInfosWriter {
- /** The file format version, a negative number. */
- public static final int FORMAT = -3;
-
- // Changed strings to true utf8 with length-in-bytes not
- // length-in-chars
- public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4;
-
- // NOTE: always change this if you switch to a new format!
- public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
-
- private FieldInfos fieldInfos;
- private IndexOutput output;
- private TermInfo lastTi = new TermInfo();
- private long size;
-
- // TODO: the default values for these two parameters should be settable from
- // IndexWriter. However, once that's done, folks will start setting them to
- // ridiculous values and complaining that things don't work well, as with
- // mergeFactor. So, let's wait until a number of folks find that alternate
- // values work better. Note that both of these values are stored in the
- // segment, so that it's safe to change these w/o rebuilding all indexes.
-
- /** Expert: The fraction of terms in the "dictionary" which should be stored
- * in RAM. Smaller values use more memory, but make searching slightly
- * faster, while larger values use less memory and make searching slightly
- * slower. Searching is typically not dominated by dictionary lookup, so
- * tweaking this is rarely useful.*/
- int indexInterval = 128;
-
- /** Expert: The fraction of {@link TermDocs} entries stored in skip tables,
- * used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in
- * smaller indexes, greater acceleration, but fewer accelerable cases, while
- * smaller values result in bigger indexes, less acceleration and more
- * accelerable cases. More detailed experiments would be useful here. */
- int skipInterval = 16;
-
- /** Expert: The maximum number of skip levels. Smaller values result in
- * slightly smaller indexes, but slower skipping in big posting lists.
- */
- int maxSkipLevels = 10;
-
- private long lastIndexPointer;
- private boolean isIndex;
- private byte[] lastTermBytes = new byte[10];
- private int lastTermBytesLength = 0;
- private int lastFieldNumber = -1;
-
- private TermInfosWriter other;
-
- TermInfosWriter(Directory directory, String segment, FieldInfos fis,
- int interval)
- throws IOException {
- initialize(directory, segment, fis, interval, false);
- other = new TermInfosWriter(directory, segment, fis, interval, true);
- other.other = this;
- }
-
- private TermInfosWriter(Directory directory, String segment, FieldInfos fis,
- int interval, boolean isIndex) throws IOException {
- initialize(directory, segment, fis, interval, isIndex);
- }
-
- private void initialize(Directory directory, String segment, FieldInfos fis,
- int interval, boolean isi) throws IOException {
- indexInterval = interval;
- fieldInfos = fis;
- isIndex = isi;
- output = directory.createOutput(segment + (isIndex ? ".tii" : ".tis"));
- output.writeInt(FORMAT_CURRENT); // write format
- output.writeLong(0); // leave space for size
- output.writeInt(indexInterval); // write indexInterval
- output.writeInt(skipInterval); // write skipInterval
- output.writeInt(maxSkipLevels); // write maxSkipLevels
- assert initUTF16Results();
- }
-
- void add(Term term, TermInfo ti) throws IOException {
- add(fieldInfos.fieldNumber(term.field()), term.bytes().bytes, term.bytes().length, ti);
- }
-
- // Currently used only by assert statements
- UnicodeUtil.UTF16Result utf16Result1;
- UnicodeUtil.UTF16Result utf16Result2;
-
- // Currently used only by assert statements
- private boolean initUTF16Results() {
- utf16Result1 = new UnicodeUtil.UTF16Result();
- utf16Result2 = new UnicodeUtil.UTF16Result();
- return true;
- }
-
- // Currently used only by assert statement
- private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) {
-
- if (lastFieldNumber != fieldNumber) {
- final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber));
- // If there is a field named "" (empty string) then we
- // will get 0 on this comparison, yet, it's "OK". But
- // it's not OK if two different field numbers map to
- // the same name.
- if (cmp != 0 || lastFieldNumber != -1)
- return cmp;
- }
-
- UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1);
- UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2);
- final int len;
- if (utf16Result1.length < utf16Result2.length)
- len = utf16Result1.length;
- else
- len = utf16Result2.length;
-
- for(int i=0;i<len;i++) {
- final char ch1 = utf16Result1.result[i];
- final char ch2 = utf16Result2.result[i];
- if (ch1 != ch2)
- return ch1 - ch2;
- }
- return utf16Result1.length - utf16Result2.length;
- }
-
- /** Adds a new <<fieldNumber, termBytes>, TermInfo> pair to the set.
- Term must be lexicographically greater than all previous Terms added.
- TermInfo pointers must be positive and greater than all previous.*/
- void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti)
- throws IOException {
-
- assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 ||
- (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) :
- "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" +
- " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" +
- " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8");
-
- assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")";
- assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")";
-
- if (!isIndex && size % indexInterval == 0)
- other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term
-
- writeTerm(fieldNumber, termBytes, termBytesLength); // write term
-
- output.writeVInt(ti.docFreq); // write doc freq
- output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
- output.writeVLong(ti.proxPointer - lastTi.proxPointer);
-
- if (ti.docFreq >= skipInterval) {
- output.writeVInt(ti.skipOffset);
- }
-
- if (isIndex) {
- output.writeVLong(other.output.getFilePointer() - lastIndexPointer);
- lastIndexPointer = other.output.getFilePointer(); // write pointer
- }
-
- lastFieldNumber = fieldNumber;
- lastTi.set(ti);
- size++;
- }
-
- private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
- throws IOException {
-
- // TODO: UTF16toUTF8 could tell us this prefix
- // Compute prefix in common with last term:
- int start = 0;
- final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength;
- while(start < limit) {
- if (termBytes[start] != lastTermBytes[start])
- break;
- start++;
- }
-
- final int length = termBytesLength - start;
- output.writeVInt(start); // write shared prefix length
- output.writeVInt(length); // write delta length
- output.writeBytes(termBytes, start, length); // write delta bytes
- output.writeVInt(fieldNumber); // write field num
- if (lastTermBytes.length < termBytesLength) {
- lastTermBytes = ArrayUtil.grow(lastTermBytes, termBytesLength);
- }
- System.arraycopy(termBytes, start, lastTermBytes, start, length);
- lastTermBytesLength = termBytesLength;
- }
-
- /** Called to complete TermInfos creation. */
- void close() throws IOException {
- output.seek(4); // write size after format
- output.writeLong(size);
- output.close();
-
- if (!isIndex)
- other.close();
- }
-
-}
Index: lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java (working copy)
@@ -18,8 +18,10 @@
*/
import org.apache.lucene.store.*;
+import org.apache.lucene.document.*;
+import org.apache.lucene.analysis.*;
import org.apache.lucene.index.*;
-import org.apache.lucene.index.codecs.*;
+import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec;
import org.apache.lucene.util.*;
import java.util.*;
@@ -30,8 +32,6 @@
public class TestSurrogates extends LuceneTestCaseJ4 {
- // chooses from a very limited alphabet to exacerbate the
- // surrogate seeking required
private static String makeDifficultRandomUnicodeString(Random r) {
final int end = r.nextInt(20);
if (end == 0) {
@@ -44,154 +44,297 @@
if (0 == t && i < end - 1) {
// hi
- buffer[i++] = (char) 0xd800;
+ buffer[i++] = (char) (0xd800 + r.nextInt(2));
// lo
- buffer[i] = (char) 0xdc00;
+ buffer[i] = (char) (0xdc00 + r.nextInt(2));
} else if (t <= 3) {
- buffer[i] = 'a';
+ buffer[i] = (char) ('a' + r.nextInt(2));
} else if (4 == t) {
- buffer[i] = 0xe000;
+ buffer[i] = (char) (0xe000 + r.nextInt(2));
}
}
return new String(buffer, 0, end);
}
- private SegmentInfo makePreFlexSegment(Random r, String segName, Directory dir, FieldInfos fieldInfos, Codec codec, List fieldTerms) throws IOException {
+ private String toHexString(Term t) {
+ return t.field() + ":" + UnicodeUtil.toHexString(t.text());
+ }
- final int numField = _TestUtil.nextInt(r, 2, 5);
+ private String getRandomString(Random r) {
+ String s;
+ if (r.nextInt(5) == 1) {
+ if (r.nextInt(3) == 1) {
+ s = makeDifficultRandomUnicodeString(r);
+ } else {
+ s = _TestUtil.randomUnicodeString(r);
+ }
+ } else {
+ s = _TestUtil.randomRealisticUnicodeString(r);
+ }
+ return s;
+ }
- List terms = new ArrayList();
+ private static class SortTermAsUTF16Comparator implements Comparator {
+ public int compare(Term o1, Term o2) {
+ return o1.compareToUTF16(o2);
+ }
+ }
- int tc = 0;
+ private static final SortTermAsUTF16Comparator termAsUTF16Comparator = new SortTermAsUTF16Comparator();
- for(int f=0;f fieldTerms, IndexReader reader, int uniqueTermCount) throws IOException {
- fieldInfos.add(field, true, false, false, false, false, false, false);
- final int numTerms = 10000*_TestUtil.getRandomMultiplier();
- for(int i=0;i= fieldTerms.size()) {
+ break;
+ }
+ term = fieldTerms.get(1+spot+i);
+ if (term.field() != field) {
+ assertNull(te.next());
+ break;
+ } else {
+ BytesRef t = te.next();
+
+ if (VERBOSE) {
+ System.out.println(" got term=" + (t == null ? null : UnicodeUtil.toHexString(t.utf8ToString())));
+ System.out.println(" exp=" + UnicodeUtil.toHexString(term.text().toString()));
+ }
+
+ assertEquals(term.bytes(), t);
+ }
+ }
}
- w.close();
+ }
- Collections.sort(fieldTerms);
+ private void doTestSeekDoesNotExist(Random r, int numField, List fieldTerms, Term[] fieldTermsArray, IndexReader reader) throws IOException {
+
+ final Map tes = new HashMap();
+
if (VERBOSE) {
- System.out.println("\nTEST: codepoint order");
- for(Term t: fieldTerms) {
- System.out.println(" " + t.field() + ":" + toHexString(t));
- }
+ System.out.println("TEST: top random seeks");
}
- dir.createOutput(segName + ".prx").close();
- dir.createOutput(segName + ".frq").close();
+ {
+ for(int iter=0;iter<100*_TestUtil.getRandomMultiplier();iter++) {
+
+ // seek to random spot
+ String field = ("f" + r.nextInt(numField)).intern();
+ Term tx = new Term(field, getRandomString(r));
- // !!hack alert!! stuffing uniqueTermCount in as docCount
- return new SegmentInfo(segName, uniqueTermCount, dir, false, -1, null, false, true, codec);
+ int spot = Arrays.binarySearch(fieldTermsArray, tx);
+
+ if (spot < 0) {
+ if (VERBOSE) {
+ System.out.println("TEST: non-exist seek to " + field + ":" + UnicodeUtil.toHexString(tx.text()));
+ }
+
+ // term does not exist:
+ TermsEnum te = tes.get(field);
+ if (te == null) {
+ te = MultiFields.getTerms(reader, field).iterator();
+ tes.put(field, te);
+ }
+
+ if (VERBOSE) {
+ System.out.println(" got enum");
+ }
+
+ spot = -spot - 1;
+
+ if (spot == fieldTerms.size() || fieldTerms.get(spot).field() != field) {
+ assertEquals(TermsEnum.SeekStatus.END, te.seek(tx.bytes()));
+ } else {
+ assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seek(tx.bytes()));
+
+ if (VERBOSE) {
+ System.out.println(" got term=" + UnicodeUtil.toHexString(te.term().utf8ToString()));
+ System.out.println(" exp term=" + UnicodeUtil.toHexString(fieldTerms.get(spot).text()));
+ }
+
+ assertEquals(fieldTerms.get(spot).bytes(),
+ te.term());
+
+ // now .next() this many times:
+ int ct = _TestUtil.nextInt(r, 5, 100);
+ for(int i=0;i= fieldTerms.size()) {
+ break;
+ }
+ Term term = fieldTerms.get(1+spot+i);
+ if (term.field() != field) {
+ assertNull(te.next());
+ break;
+ } else {
+ BytesRef t = te.next();
+
+ if (VERBOSE) {
+ System.out.println(" got term=" + (t == null ? null : UnicodeUtil.toHexString(t.utf8ToString())));
+ System.out.println(" exp=" + UnicodeUtil.toHexString(term.text().toString()));
+ }
+
+ assertEquals(term.bytes(), t);
+ }
+ }
+
+ }
+ }
+ }
+ }
}
- private String toHexString(Term t) {
- return t.field() + ":" + UnicodeUtil.toHexString(t.text());
- }
-
+
@Test
public void testSurrogatesOrder() throws Exception {
+ Random r = newRandom();
+
Directory dir = new MockRAMDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(r,
+ dir,
+ newIndexWriterConfig(r, TEST_VERSION_CURRENT,
+ new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec(new PreFlexRWCodec())));
- Codec codec = new PreFlexCodec();
+ final int numField = _TestUtil.nextInt(r, 2, 5);
- Random r = newRandom();
- FieldInfos fieldInfos = new FieldInfos();
+ int uniqueTermCount = 0;
+
+ int tc = 0;
+
List fieldTerms = new ArrayList();
- SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms);
- // hack alert!!
- int uniqueTermCount = si.docCount;
+ for(int f=0;f uniqueTerms = new HashSet();
+ for(int i=0;i,Object> checkedClasses =
Collections.synchronizedMap(new WeakHashMap,Object>());
+ // saves the default codec: we do this statically because many tests build indexes in @BeforeClass
+ private static String savedDefaultCodec;
+ private static String codec;
+ private static Codec preFlexSav;
+
+ // swaps in the test-only PreFlexRW codec and returns the PreFlex codec it replaced
+ public static Codec installPreFlexRW() {
+ final Codec preFlex = CodecProvider.getDefault().lookup("PreFlex");
+ if (preFlex != null) {
+ CodecProvider.getDefault().unregister(preFlex);
+ }
+ CodecProvider.getDefault().register(new PreFlexRWCodec());
+ return preFlex;
+ }
+
+ // restores the previously saved read-only PreFlex codec, replacing the test-only PreFlexRW codec
+ public static void restorePreFlex(Codec preFlex) {
+ Codec preFlexRW = CodecProvider.getDefault().lookup("PreFlex");
+ if (preFlexRW != null) {
+ CodecProvider.getDefault().unregister(preFlexRW);
+ }
+ CodecProvider.getDefault().register(preFlex);
+ }
+
+ @BeforeClass
+ public static void beforeClassLuceneTestCaseJ4() {
+ savedDefaultCodec = CodecProvider.getDefaultCodec();
+ codec = _TestUtil.getTestCodec();
+ if (codec.equals("random"))
+ codec = CodecProvider.CORE_CODECS[seedRnd.nextInt(CodecProvider.CORE_CODECS.length)];
+
+ // If we're running w/ PreFlex codec we must swap in the
+ // test-only PreFlexRW codec (since core PreFlex can
+ // only read segments):
+ if (codec.equals("PreFlex")) {
+ preFlexSav = installPreFlexRW();
+ }
+
+ CodecProvider.setDefaultCodec(codec);
+ }
+
+ @AfterClass
+ public static void afterClassLuceneTestCaseJ4() {
+ // Restore read-only PreFlex codec:
+ if (codec.equals("PreFlex")) {
+ restorePreFlex(preFlexSav);
+ }
+ CodecProvider.setDefaultCodec(savedDefaultCodec);
+ }
+
// This is how we get control when errors occur.
// Think of this as start/end/success/failed
// events.
@@ -369,6 +429,34 @@
return new Random(seed);
}
+ /** create a new index writer config with random defaults */
+ public static IndexWriterConfig newIndexWriterConfig(Random r, Version v, Analyzer a) {
+ IndexWriterConfig c = new IndexWriterConfig(v, a);
+ if (r.nextBoolean()) {
+ c.setMergePolicy(new LogDocMergePolicy());
+ }
+ if (r.nextBoolean()) {
+ c.setMergeScheduler(new SerialMergeScheduler());
+ }
+ if (r.nextBoolean()) {
+ c.setMaxBufferedDocs(_TestUtil.nextInt(r, 2, 1000));
+ }
+ if (r.nextBoolean()) {
+ c.setTermIndexInterval(_TestUtil.nextInt(r, 1, 1000));
+ }
+
+ if (c.getMergePolicy() instanceof LogMergePolicy) {
+ LogMergePolicy logmp = (LogMergePolicy) c.getMergePolicy();
+ logmp.setUseCompoundDocStore(r.nextBoolean());
+ logmp.setUseCompoundFile(r.nextBoolean());
+ logmp.setCalibrateSizeByDeletes(r.nextBoolean());
+ logmp.setMergeFactor(_TestUtil.nextInt(r, 2, 20));
+ }
+
+ c.setReaderPooling(r.nextBoolean());
+ return c;
+ }
+
public String getName() {
return this.name;
}
@@ -392,6 +480,10 @@
System.out.println("NOTE: random static seed of testclass '" + getName() + "' was: " + staticSeed);
}
+ if (_TestUtil.getTestCodec().equals("random")) {
+ System.out.println("NOTE: random codec of testcase '" + getName() + "' was: " + codec);
+ }
+
if (seed != null) {
System.out.println("NOTE: random seed of testcase '" + getName() + "' was: " + seed);
}
@@ -404,5 +496,4 @@
private static final Random seedRnd = new Random();
private String name = "";
-
}
Index: lucene/src/test/org/apache/lucene/util/LuceneTestCase.java
===================================================================
--- lucene/src/test/org/apache/lucene/util/LuceneTestCase.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/util/LuceneTestCase.java (working copy)
@@ -29,11 +29,15 @@
import junit.framework.TestCase;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.ConcurrentMergeScheduler;
+import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.FieldCache.CacheEntry;
import org.apache.lucene.util.FieldCacheSanityChecker.Insanity;
+import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.index.codecs.Codec;
/**
* Base class for all Lucene unit tests.
@@ -72,6 +76,10 @@
private volatile Thread.UncaughtExceptionHandler savedUncaughtExceptionHandler = null;
+ private String savedDefaultCodec;
+ private String codec;
+ private Codec preFlexSav;
+
/** Used to track if setUp and tearDown are called correctly from subclasses */
private boolean setup;
@@ -110,6 +118,19 @@
ConcurrentMergeScheduler.setTestMode();
savedBoolMaxClauseCount = BooleanQuery.getMaxClauseCount();
+ savedDefaultCodec = CodecProvider.getDefaultCodec();
+
+ codec = _TestUtil.getTestCodec();
+ if (codec.equals("random"))
+ codec = CodecProvider.CORE_CODECS[seedRnd.nextInt(CodecProvider.CORE_CODECS.length)];
+
+ // If we're running w/ PreFlex codec we must swap in the
+ // test-only PreFlexRW codec (since core PreFlex can
+ // only read segments):
+ if (codec.equals("PreFlex")) {
+ preFlexSav = LuceneTestCaseJ4.installPreFlexRW();
+ }
+ CodecProvider.setDefaultCodec(codec);
}
/**
@@ -135,7 +156,12 @@
assertTrue("ensure your setUp() calls super.setUp()!!!", setup);
setup = false;
BooleanQuery.setMaxClauseCount(savedBoolMaxClauseCount);
-
+ // Restore read-only PreFlex codec:
+ if (codec.equals("PreFlex")) {
+ LuceneTestCaseJ4.restorePreFlex(preFlexSav);
+ }
+ CodecProvider.setDefaultCodec(savedDefaultCodec);
+
try {
Thread.setDefaultUncaughtExceptionHandler(savedUncaughtExceptionHandler);
if (!uncaughtExceptions.isEmpty()) {
@@ -264,7 +290,12 @@
this.seed = Long.valueOf(seed);
return new Random(seed);
}
-
+
+ /** create a new index writer config with random defaults */
+ public static IndexWriterConfig newIndexWriterConfig(Random r, Version v, Analyzer a) {
+ return LuceneTestCaseJ4.newIndexWriterConfig(r, v, a);
+ }
+
/** Gets a resource from the classpath as {@link File}. This method should only be used,
* if a real file is needed. To get a stream, code should prefer
* {@link Class#getResourceAsStream} using {@code this.getClass()}.
@@ -284,6 +315,9 @@
seed = null;
super.runBare();
} catch (Throwable e) {
+ if (_TestUtil.getTestCodec().equals("random")) {
+ System.out.println("NOTE: random codec of testcase '" + getName() + "' was: " + codec);
+ }
if (seed != null) {
System.out.println("NOTE: random seed of testcase '" + getName() + "' was: " + seed);
}
Index: lucene/src/test/org/apache/lucene/util/_TestUtil.java
===================================================================
--- lucene/src/test/org/apache/lucene/util/_TestUtil.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/util/_TestUtil.java (working copy)
@@ -23,6 +23,9 @@
import org.apache.lucene.index.MergeScheduler;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.CheckIndex;
+import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.index.codecs.Codec;
+import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.Directory;
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
@@ -129,9 +132,25 @@
}
final char[] buffer = new char[end];
for (int i = 0; i < end; i++) {
+
int t = r.nextInt(5);
+
//buffer[i] = (char) (97 + r.nextInt(26));
+
+ /*
if (0 == t && i < end - 1) {
+ // hi
+ buffer[i++] = (char) 0xd800;
+ // lo
+ buffer[i] = (char) 0xdc00;
+ } else if (t <= 3) {
+ buffer[i] = 'a';
+ } else if (4 == t) {
+ buffer[i] = 0xe000;
+ }
+ */
+
+ if (0 == t && i < end - 1) {
// Make a surrogate pair
// High surrogate
buffer[i++] = (char) nextInt(r, 0xd800, 0xdbff);
@@ -218,4 +237,39 @@
public static int getRandomMultiplier() {
return Integer.parseInt(System.getProperty("random.multiplier", "1"));
}
+
+ /** gets the codec to run tests with */
+ public static String getTestCodec() {
+ // by default we randomly pick a different codec for
+ // each test case (non-J4 tests) and each test class (J4
+ // tests)
+ return System.getProperty("tests.codec", "random");
+ }
+
+ public static CodecProvider alwaysCodec(final Codec c) {
+ return new CodecProvider() {
+ @Override
+ public Codec getWriter(SegmentWriteState state) {
+ return c;
+ }
+
+ @Override
+ public Codec lookup(String name) {
+ // can't do this until we fix PreFlexRW to not
+ // impersonate PreFlex:
+ if (name.equals(c.name)) {
+ return c;
+ } else {
+ return CodecProvider.getDefault().lookup(name);
+ }
+ }
+ };
+ }
+
+ /** Return a CodecProvider that can read any of the
+ * default codecs, but always writes in the specified
+ * codec. */
+ public static CodecProvider alwaysCodec(final String codec) {
+ return alwaysCodec(CodecProvider.getDefault().lookup(codec));
+ }
}
Property changes on: lucene\src\test\org\apache\lucene\util\TestAttributeSource.java
___________________________________________________________________
Modified: svn:mergeinfo
Merged /lucene/dev/branches/preflexfixes/lucene/src/test/org/apache/lucene/util/TestAttributeSource.java:r967125-979432
Index: lucene/src/test/org/apache/lucene/document/TestBinaryDocument.java
===================================================================
--- lucene/src/test/org/apache/lucene/document/TestBinaryDocument.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/document/TestBinaryDocument.java (working copy)
@@ -2,9 +2,7 @@
import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.store.MockRAMDirectory;
@@ -58,8 +56,7 @@
/** add the doc to a ram index */
MockRAMDirectory dir = new MockRAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir);
writer.addDocument(doc);
/** open a reader and fetch the document */
@@ -98,8 +95,7 @@
/** add the doc to a ram index */
MockRAMDirectory dir = new MockRAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir);
writer.addDocument(doc);
/** open a reader and fetch the document */
Property changes on: lucene\src\test\org\apache\lucene\document\TestNumberTools.java
___________________________________________________________________
Modified: svn:mergeinfo
Merged /lucene/dev/branches/preflexfixes/lucene/src/test/org/apache/lucene/document/TestNumberTools.java:r967125-979432
Property changes on: lucene\src\test\org\apache\lucene\document\TestDateTools.java
___________________________________________________________________
Modified: svn:mergeinfo
Merged /lucene/dev/branches/preflexfixes/lucene/src/test/org/apache/lucene/document/TestDateTools.java:r967125-979432
Index: lucene/src/test/org/apache/lucene/document/TestDocument.java
===================================================================
--- lucene/src/test/org/apache/lucene/document/TestDocument.java (revision 979430)
+++ lucene/src/test/org/apache/lucene/document/TestDocument.java (working copy)
@@ -1,8 +1,6 @@
package org.apache.lucene.document;
-import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
@@ -155,8 +153,7 @@
*/
public void testGetValuesForIndexedDocument() throws Exception {
RAMDirectory dir = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir);
writer.addDocument(makeDocumentWithFields());
IndexReader reader = writer.getReader();
@@ -234,8 +231,7 @@
Field.Index.NOT_ANALYZED));
RAMDirectory dir = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir);
writer.addDocument(doc);
field.setValue("id2");
writer.addDocument(doc);
Property changes on: lucene\src\java\org\apache\lucene\analysis\Tokenizer.java
___________________________________________________________________
Modified: svn:mergeinfo
Merged /lucene/dev/branches/preflexfixes/lucene/src/java/org/apache/lucene/analysis/Tokenizer.java:r967125-979432
Property changes on: lucene\src\java\org\apache\lucene\search\MultiTermQueryWrapperFilter.java
___________________________________________________________________
Modified: svn:mergeinfo
Merged /lucene/dev/branches/preflexfixes/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:r967125-979432
Index: lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java (revision 979430)
+++ lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java (working copy)
@@ -792,6 +792,7 @@
throws IOException {
String field = StringHelper.intern(entryKey.field);
+
Terms terms = MultiFields.getTerms(reader, field);
final boolean fasterButMoreRAM = ((Boolean) entryKey.custom).booleanValue();
Index: lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java (revision 979430)
+++ lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java (working copy)
@@ -116,7 +116,7 @@
// different TermComps
final Comparator subTermComp = termsEnumIndex.termsEnum.getComparator();
if (subTermComp != null && !subTermComp.equals(termComp)) {
- throw new IllegalStateException("sub-readers have different BytesRef.Comparators; cannot merge");
+ throw new IllegalStateException("sub-readers have different BytesRef.Comparators: " + subTermComp + " vs " + termComp + "; cannot merge");
}
}
Index: lucene/src/java/org/apache/lucene/index/DocumentsWriter.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (revision 979430)
+++ lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (working copy)
@@ -1095,7 +1095,7 @@
continue;
}
assert checkDeleteTerm(term);
-
+
if (termsEnum.seek(term.bytes(), false) == TermsEnum.SeekStatus.FOUND) {
DocsEnum docsEnum = termsEnum.docs(reader.getDeletedDocs(), docs);
Index: lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java (revision 979430)
+++ lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java (working copy)
@@ -22,6 +22,7 @@
import org.apache.lucene.index.FieldsEnum;
import java.io.IOException;
+import java.io.Closeable;
/** Abstract API that consumes terms, doc, freq, prox and
* payloads postings. Concrete implementations of this
@@ -30,7 +31,7 @@
*
* @lucene.experimental
*/
-public abstract class FieldsConsumer {
+public abstract class FieldsConsumer implements Closeable {
/** Add a new field */
public abstract TermsConsumer addField(FieldInfo field) throws IOException;
Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java (revision 979430)
+++ lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java (working copy)
@@ -84,26 +84,16 @@
format = firstInt;
// check that it is a format we can understand
- if (format > FORMAT_MINIMUM)
- throw new IndexFormatTooOldException(null, format, FORMAT_MINIMUM, FORMAT_CURRENT);
- if (format < FORMAT_CURRENT)
- throw new IndexFormatTooNewException(null, format, FORMAT_MINIMUM, FORMAT_CURRENT);
+ if (format > FORMAT_MINIMUM)
+ throw new IndexFormatTooOldException(null, format, FORMAT_MINIMUM, FORMAT_CURRENT);
+ if (format < FORMAT_CURRENT)
+ throw new IndexFormatTooNewException(null, format, FORMAT_MINIMUM, FORMAT_CURRENT);
size = input.readLong(); // read the size
- if(format == -1){
- if (!isIndex) {
- indexInterval = input.readInt();
- formatM1SkipInterval = input.readInt();
- }
- // switch off skipTo optimization for file format prior to 1.4rc2 in order to avoid a bug in
- // skipTo implementation of these versions
- skipInterval = Integer.MAX_VALUE;
- } else {
- indexInterval = input.readInt();
- skipInterval = input.readInt();
- maxSkipLevels = input.readInt();
- }
+ indexInterval = input.readInt();
+ skipInterval = input.readInt();
+ maxSkipLevels = input.readInt();
assert indexInterval > 0: "indexInterval=" + indexInterval + " is negative; must be > 0";
assert skipInterval > 0: "skipInterval=" + skipInterval + " is negative; must be > 0";
}
@@ -132,18 +122,21 @@
position = p;
termBuffer.set(t);
prevBuffer.reset();
+ //System.out.println(" ste doSeek prev=" + prevBuffer.toTerm() + " this=" + this);
termInfo.set(ti);
}
/** Increments the enumeration to the next element. True if one exists.*/
public final boolean next() throws IOException {
+ prevBuffer.set(termBuffer);
+ //System.out.println(" ste setPrev=" + prev() + " this=" + this);
+
if (position++ >= size - 1) {
- prevBuffer.set(termBuffer);
termBuffer.reset();
+ //System.out.println(" EOF");
return false;
}
- prevBuffer.set(termBuffer);
termBuffer.read(input, fieldInfos);
newSuffixStart = termBuffer.newSuffixStart;
@@ -168,6 +161,7 @@
if (isIndex)
indexPointer += input.readVLong(); // read index pointer
+ //System.out.println(" ste ret term=" + term());
return true;
}
Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java (revision 979430)
+++ lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java (working copy)
@@ -18,9 +18,10 @@
*/
import java.io.IOException;
+import java.util.Comparator;
+
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.FieldInfos;
@@ -28,102 +29,65 @@
private String field;
private Term term; // cached
- private boolean dirty; // true if text was set externally (ie not read via UTF8 bytes)
- private UnicodeUtil.UTF16Result text = new UnicodeUtil.UTF16Result();
private BytesRef bytes = new BytesRef(10);
- int newSuffixStart;
+ private static final Comparator utf8AsUTF16Comparator = BytesRef.getUTF8SortedAsUTF16Comparator();
- public final int compareTo(TermBuffer other) {
+ int newSuffixStart; // only valid right after .read is called
+
+ public int compareTo(TermBuffer other) {
if (field == other.field) // fields are interned
- return compareChars(text.result, text.length, other.text.result, other.text.length);
+ return utf8AsUTF16Comparator.compare(bytes, other.bytes);
else
return field.compareTo(other.field);
}
- private static int compareChars(char[] chars1, int len1,
- char[] chars2, int len2) {
- final int end = len1 < len2 ? len1:len2;
- for (int k = 0; k < end; k++) {
- char c1 = chars1[k];
- char c2 = chars2[k];
- if (c1 != c2) {
- return c1 - c2;
- }
- }
- return len1 - len2;
- }
-
- public final void read(IndexInput input, FieldInfos fieldInfos)
+ public void read(IndexInput input, FieldInfos fieldInfos)
throws IOException {
this.term = null; // invalidate cache
- int start = input.readVInt();
+ newSuffixStart = input.readVInt();
int length = input.readVInt();
- int totalLength = start + length;
+ int totalLength = newSuffixStart + length;
if (bytes.bytes.length < totalLength) {
bytes.grow(totalLength);
}
- if (dirty) {
- // Fully convert all bytes since bytes is dirty
- UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
- bytes.length = totalLength;
- input.readBytes(bytes.bytes, start, length);
- UnicodeUtil.UTF8toUTF16(bytes.bytes, 0, totalLength, text);
- dirty = false;
- } else {
- // Incrementally convert only the UTF8 bytes that are new:
- bytes.length = totalLength;
- input.readBytes(bytes.bytes, start, length);
- UnicodeUtil.UTF8toUTF16(bytes.bytes, start, length, text);
- }
-
- while(true) {
- newSuffixStart = text.offsets[start];
- if (newSuffixStart != -1) {
- break;
- }
- if (--start == 0) {
- newSuffixStart = 0;
- break;
- }
- }
+ bytes.length = totalLength;
+ input.readBytes(bytes.bytes, newSuffixStart, length);
this.field = fieldInfos.fieldName(input.readVInt());
}
- public final void set(Term term) {
+ public void set(Term term) {
if (term == null) {
reset();
return;
}
-
- final BytesRef termBytes = term.bytes();
- UnicodeUtil.UTF8toUTF16(termBytes.bytes, termBytes.offset, termBytes.length, text);
- dirty = true;
+ bytes.copy(term.bytes());
field = term.field();
this.term = term;
}
- public final void set(TermBuffer other) {
- text.copyText(other.text);
- dirty = true;
+ public void set(TermBuffer other) {
field = other.field;
- term = other.term;
+ // dangerous to copy Term over, since the underlying
+ // BytesRef could subsequently be modified:
+ term = null;
+ bytes.copy(other.bytes);
}
public void reset() {
field = null;
- text.setLength(0);
term = null;
- dirty = true;
}
public Term toTerm() {
if (field == null) // unset
return null;
- if (term == null)
- term = new Term(field, new BytesRef(text.result, 0, text.length), false);
+ if (term == null) {
+ term = new Term(field, new BytesRef(bytes), false);
+ //term = new Term(field, bytes, false);
+ }
return term;
}
@@ -134,12 +98,7 @@
try {
clone = (TermBuffer)super.clone();
} catch (CloneNotSupportedException e) {}
- clone.dirty = true;
- clone.bytes = new BytesRef(10);
- clone.text = new UnicodeUtil.UTF16Result();
- clone.text.offsets = new int[text.offsets.length];
- System.arraycopy(text.offsets, 0, clone.text.offsets, 0, text.offsets.length);
- clone.text.copyText(text);
+ clone.bytes = new BytesRef(bytes);
return clone;
}
}
Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java (revision 979430)
+++ lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java (working copy)
@@ -40,16 +40,16 @@
public class PreFlexCodec extends Codec {
/** Extension of terms file */
- static final String TERMS_EXTENSION = "tis";
+ public static final String TERMS_EXTENSION = "tis";
/** Extension of terms index file */
- static final String TERMS_INDEX_EXTENSION = "tii";
+ public static final String TERMS_INDEX_EXTENSION = "tii";
/** Extension of freq postings file */
- static final String FREQ_EXTENSION = "frq";
+ public static final String FREQ_EXTENSION = "frq";
/** Extension of prox postings file */
- static final String PROX_EXTENSION = "prx";
+ public static final String PROX_EXTENSION = "prx";
public PreFlexCodec() {
name = "PreFlex";
Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfo.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfo.java (revision 979430)
+++ lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfo.java (working copy)
@@ -23,30 +23,30 @@
* indexing. */
@Deprecated
-class TermInfo {
+public class TermInfo {
/** The number of documents which contain the term. */
- int docFreq = 0;
+ public int docFreq = 0;
- long freqPointer = 0;
- long proxPointer = 0;
- int skipOffset;
+ public long freqPointer = 0;
+ public long proxPointer = 0;
+ public int skipOffset;
- TermInfo() {}
+ public TermInfo() {}
- TermInfo(int df, long fp, long pp) {
+ public TermInfo(int df, long fp, long pp) {
docFreq = df;
freqPointer = fp;
proxPointer = pp;
}
- TermInfo(TermInfo ti) {
+ public TermInfo(TermInfo ti) {
docFreq = ti.docFreq;
freqPointer = ti.freqPointer;
proxPointer = ti.proxPointer;
skipOffset = ti.skipOffset;
}
- final void set(int docFreq,
+ public final void set(int docFreq,
long freqPointer, long proxPointer, int skipOffset) {
this.docFreq = docFreq;
this.freqPointer = freqPointer;
@@ -54,7 +54,7 @@
this.skipOffset = skipOffset;
}
- final void set(TermInfo ti) {
+ public final void set(TermInfo ti) {
docFreq = ti.docFreq;
freqPointer = ti.freqPointer;
proxPointer = ti.proxPointer;
Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java (revision 979430)
+++ lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java (working copy)
@@ -119,9 +119,12 @@
indexTerms = new Term[indexSize];
indexInfos = new TermInfo[indexSize];
indexPointers = new long[indexSize];
-
- for (int i = 0; indexEnum.next(); i++) {
+
+ for (int i=0;indexEnum.next(); i++) {
indexTerms[i] = indexEnum.term();
+ assert indexTerms[i] != null;
+ assert indexTerms[i].text() != null;
+ assert indexTerms[i].field() != null;
indexInfos[i] = indexEnum.termInfo();
indexPointers[i] = indexEnum.indexPointer;
@@ -160,14 +163,14 @@
return origEnum.maxSkipLevels;
}
- final void close() throws IOException {
+ void close() throws IOException {
if (origEnum != null)
origEnum.close();
threadResources.close();
}
/** Returns the number of term/value pairs in the set. */
- final long size() {
+ long size() {
return size;
}
@@ -183,12 +186,13 @@
/** Returns the offset of the greatest index entry which is less than or equal to term.*/
- private final int getIndexOffset(Term term) {
+ private int getIndexOffset(Term term) {
int lo = 0; // binary search indexTerms[]
int hi = indexTerms.length - 1;
while (hi >= lo) {
int mid = (lo + hi) >>> 1;
+ assert indexTerms[mid] != null : "indexTerms = " + indexTerms.length + " mid=" + mid;
int delta = term.compareToUTF16(indexTerms[mid]);
if (delta < 0)
hi = mid - 1;
@@ -200,7 +204,7 @@
return hi;
}
- private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
+ private void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
enumerator.seek(indexPointers[indexOffset],
((long) indexOffset * totalIndexInterval) - 1,
indexTerms[indexOffset], indexInfos[indexOffset]);
@@ -231,6 +235,9 @@
}
TermInfo seekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd) throws IOException {
+ if (size == 0) {
+ return null;
+ }
// optimize sequential access: first try scanning cached enum w/o seeking
if (enumerator.term() != null // term is at or past current
@@ -242,7 +249,6 @@
// no need to seek
final TermInfo ti;
-
int numScans = enumerator.scanTo(term);
if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) {
ti = enumerator.termInfo();
@@ -279,6 +285,7 @@
seekEnum(enumerator, indexPos);
enumerator.scanTo(term);
final TermInfo ti;
+
if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) {
ti = enumerator.termInfo();
if (tiOrd == null) {
@@ -294,7 +301,7 @@
}
// called only from asserts
- private final boolean sameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator) {
+ private boolean sameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator) {
if (ti1.docFreq != ti2.docFreq) {
return false;
}
@@ -319,7 +326,7 @@
}
/** Returns the position of a Term in the set or -1. */
- final long getPosition(Term term) throws IOException {
+ long getPosition(Term term) throws IOException {
if (size == 0) return -1;
ensureIndexIsRead();
Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (revision 979430)
+++ lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (working copy)
@@ -40,12 +40,11 @@
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
-import org.apache.lucene.util.ArrayUtil;
/** Exposes flex API on a pre-flex index, as a codec.
* @lucene.experimental */
public class PreFlexFields extends FieldsProducer {
-
+
private static final boolean DEBUG_SURROGATES = false;
public TermInfosReader tis;
@@ -60,7 +59,7 @@
private final int readBufferSize;
private Directory cfsReader;
- PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor)
+ public PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor)
throws IOException {
si = info;
@@ -107,6 +106,15 @@
this.dir = dir;
}
+ // If this returns true, we do the surrogates dance so that the
+ // terms are sorted by unicode sort order. This should be
+ // true when segments are used for "normal" searching;
+ // it's only false during testing, to create a pre-flex
+ // index, using the test-only PreFlexRW.
+ protected boolean sortTermsByUnicode() {
+ return true;
+ }
+
static void files(Directory dir, SegmentInfo info, Collection files) throws IOException {
files.add(IndexFileNames.segmentFileName(info.name, "", PreFlexCodec.TERMS_EXTENSION));
files.add(IndexFileNames.segmentFileName(info.name, "", PreFlexCodec.TERMS_INDEX_EXTENSION));
@@ -182,6 +190,12 @@
if (cfsReader != null) {
cfsReader.close();
}
+ if (freqStream != null) {
+ freqStream.close();
+ }
+ if (proxStream != null) {
+ proxStream.close();
+ }
}
private class PreFlexFieldsEnum extends FieldsEnum {
@@ -228,7 +242,11 @@
public Comparator getComparator() {
// Pre-flex indexes always sorted in UTF16 order, but
// we remap on-the-fly to unicode order
- return BytesRef.getUTF8SortedAsUnicodeComparator();
+ if (sortTermsByUnicode()) {
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
+ } else {
+ return BytesRef.getUTF8SortedAsUTF16Comparator();
+ }
}
}
@@ -238,119 +256,225 @@
private boolean skipNext;
private BytesRef current;
- private int[] surrogateSeekPending = new int[1];
- private boolean[] surrogateDidSeekBack = new boolean[1];
- private int surrogateSeekUpto;
- private char[] pendingPrefix;
-
private SegmentTermEnum seekTermEnum;
private Term protoTerm;
+
+ private static final byte UTF8_NON_BMP_LEAD = (byte) 0xf0;
+ private static final byte UTF8_HIGH_BMP_LEAD = (byte) 0xee;
+
+ // Returns true if the unicode char is "after" the
+ // surrogates in UTF16, ie >= U+E000 and <= U+FFFF:
+ private final boolean isHighBMPChar(byte[] b, int idx) {
+ return (b[idx] & UTF8_HIGH_BMP_LEAD) == UTF8_HIGH_BMP_LEAD;
+ }
+
+ // Returns true if the unicode char in the UTF8 byte
+ // sequence starting at idx encodes a char outside of
+ // BMP (ie what would be a surrogate pair in UTF16):
+ private final boolean isNonBMPChar(byte[] b, int idx) {
+ return (b[idx] & UTF8_NON_BMP_LEAD) == UTF8_NON_BMP_LEAD;
+ }
+
+ private final byte[] scratch = new byte[4];
+ private final BytesRef prevTerm = new BytesRef();
+ private final BytesRef scratchTerm = new BytesRef();
private int newSuffixStart;
- void reset(FieldInfo fieldInfo) throws IOException {
- this.fieldInfo = fieldInfo;
- protoTerm = new Term(fieldInfo.name);
- if (termEnum == null) {
- termEnum = getTermsDict().terms(protoTerm);
- seekTermEnum = getTermsDict().terms(protoTerm);
+ // Swap in S, in place of E:
+ private boolean seekToNonBMP(SegmentTermEnum te, BytesRef term, int pos) throws IOException {
+ final int savLength = term.length;
+
+ assert term.offset == 0;
+
+ // The 3 bytes starting at pos make up 1
+ // unicode character:
+ assert isHighBMPChar(term.bytes, pos);
+
+ // NOTE: we cannot make this assert, because
+ // AutomatonQuery legitimately sends us malformed UTF8
+ // (eg the UTF8 bytes with just 0xee)
+ // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString();
+
+ // Save the bytes && length, since we need to
+ // restore this if seek "back" finds no matching
+ // terms
+ if (term.bytes.length < 4+pos) {
+ term.grow(4+pos);
+ }
+
+ scratch[0] = term.bytes[pos];
+ scratch[1] = term.bytes[pos+1];
+ scratch[2] = term.bytes[pos+2];
+
+ term.bytes[pos] = (byte) 0xf0;
+ term.bytes[pos+1] = (byte) 0x90;
+ term.bytes[pos+2] = (byte) 0x80;
+ term.bytes[pos+3] = (byte) 0x80;
+ term.length = 4+pos;
+
+ if (DEBUG_SURROGATES) {
+ System.out.println(" try seek term=" + UnicodeUtil.toHexString(term.utf8ToString()));
+ }
+
+ // Seek "back":
+ getTermsDict().seekEnum(te, protoTerm.createTerm(term));
+
+ // Test if the term we seek'd to in fact found a
+ // surrogate pair at the same position as the E:
+ Term t2 = te.term();
+
+ // Cannot be null (or move to next field) because at
+ // "worst" it'd seek to the same term we are on now,
+ // unless we are being called from seek
+ if (t2 == null || t2.field() != fieldInfo.name) {
+ return false;
+ }
+
+ if (DEBUG_SURROGATES) {
+ System.out.println(" got term=" + UnicodeUtil.toHexString(t2.text()));
+ }
+
+ // Now test if prefix is identical and we found
+ // a non-BMP char at the same position:
+ BytesRef b2 = t2.bytes();
+ assert b2.offset == 0;
+
+ boolean matches;
+ if (b2.length >= term.length && isNonBMPChar(b2.bytes, pos)) {
+ matches = true;
+ for(int i=0;i 0) {
- sb.append(' ');
+ boolean didSeek = false;
+
+ final int limit = Math.min(newSuffixStart, scratchTerm.length-1);
+
+ while(downTo > limit) {
+
+ if (isHighBMPChar(prevTerm.bytes, downTo)) {
+
+ if (DEBUG_SURROGATES) {
+ System.out.println(" found E pos=" + downTo + " vs len=" + prevTerm.length);
}
- sb.append(surrogateSeekPending[i]);
+
+ if (seekToNonBMP(seekTermEnum, prevTerm, downTo)) {
+ // TODO: more efficient seek?
+ getTermsDict().seekEnum(termEnum, seekTermEnum.term());
+ //newSuffixStart = downTo+4;
+ newSuffixStart = downTo;
+ scratchTerm.copy(termEnum.term().bytes());
+ didSeek = true;
+ if (DEBUG_SURROGATES) {
+ System.out.println(" seek!");
+ }
+ break;
+ } else {
+ if (DEBUG_SURROGATES) {
+ System.out.println(" no seek");
+ }
+ }
}
- sb.append(" pendingSeekText=" + new String(pendingPrefix, 0, surrogateSeekPending[surrogateSeekUpto-1]));
- return sb.toString();
+
+ // Shorten prevTerm in place so that we don't redo
+ // this loop if we come back here:
+ if ((prevTerm.bytes[downTo] & 0xc0) == 0xc0 || (prevTerm.bytes[downTo] & 0x80) == 0) {
+ prevTerm.length = downTo;
+ }
+
+ downTo--;
}
+
+ return didSeek;
}
- private boolean popPendingSeek() throws IOException {
+ // Look for seek type 3 ("pop"): if the delta from
+ // prev -> current was replacing an S with an E,
+ // we must now seek to beyond that E. This seek
+ // "finishes" the dance at this character
+ // position.
+ private boolean doPop() throws IOException {
+
if (DEBUG_SURROGATES) {
- System.out.println(" check pop newSuffix=" + newSuffixStart + " stack=" + getStack());
+ System.out.println(" try pop");
}
- // if a .next() has advanced beyond the
- // after-surrogates range we had last seeked to, we
- // must seek back to the start and resume .next from
- // there. this pops the pending seek off the stack.
- final Term t = termEnum.term();
- if (surrogateSeekUpto > 0) {
- final int seekPrefix = surrogateSeekPending[surrogateSeekUpto-1];
+
+ assert newSuffixStart <= prevTerm.length;
+ assert newSuffixStart < scratchTerm.length || newSuffixStart == 0;
+
+ if (prevTerm.length > newSuffixStart &&
+ isNonBMPChar(prevTerm.bytes, newSuffixStart) &&
+ isHighBMPChar(scratchTerm.bytes, newSuffixStart)) {
+
+ // Seek type 2 -- put 0xFF at this position:
+ scratchTerm.bytes[newSuffixStart] = (byte) 0xff;
+ scratchTerm.length = newSuffixStart+1;
+
if (DEBUG_SURROGATES) {
- System.out.println(" seekPrefix=" + seekPrefix);
+ System.out.println(" seek to term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString());
}
- if (newSuffixStart < seekPrefix) {
- assert pendingPrefix != null;
- assert pendingPrefix.length > seekPrefix;
- pendingPrefix[seekPrefix] = UnicodeUtil.UNI_SUR_HIGH_START;
- pendingPrefix[1+seekPrefix] = UnicodeUtil.UNI_SUR_LOW_START;
- Term t2 = protoTerm.createTerm(new BytesRef(pendingPrefix, 0, 2+seekPrefix));
+
+ // TODO: more efficient seek? can we simply swap
+ // the enums?
+ getTermsDict().seekEnum(termEnum, protoTerm.createTerm(scratchTerm));
+
+ final Term t2 = termEnum.term();
+
+ // We could hit EOF or different field since this
+ // was a seek "forward":
+ if (t2 != null && t2.field() == fieldInfo.name) {
+
if (DEBUG_SURROGATES) {
- System.out.println(" do pop; seek back to " + UnicodeUtil.toHexString(t2.text()));
+ System.out.println(" got term=" + UnicodeUtil.toHexString(t2.text()) + " " + t2.bytes());
}
- getTermsDict().seekEnum(termEnum, t2);
- surrogateDidSeekBack[surrogateSeekUpto-1] = true;
- // +2 because we don't want to re-check the
- // surrogates we just seek'd back to
- newSuffixStart = seekPrefix + 2;
+ final BytesRef b2 = t2.bytes();
+ assert b2.offset == 0;
+
+
+ // Set newSuffixStart -- we can't use
+ // termEnum's since the above seek may have
+ // done no scanning (eg, term was precisely
+ // an index term, or, was in the term seek
+ // cache):
+ scratchTerm.copy(b2);
+ setNewSuffixStart(prevTerm, scratchTerm);
+
return true;
- } else if (newSuffixStart == seekPrefix && surrogateDidSeekBack[surrogateSeekUpto-1] && t != null && t.field() == fieldInfo.name && t.text().charAt(seekPrefix) > UnicodeUtil.UNI_SUR_LOW_END) {
- assert pendingPrefix != null;
- assert pendingPrefix.length > seekPrefix;
- pendingPrefix[seekPrefix] = 0xffff;
- Term t2 = protoTerm.createTerm(new BytesRef(pendingPrefix, 0, 1+seekPrefix));
+ } else if (newSuffixStart != 0 || scratchTerm.length != 0) {
if (DEBUG_SURROGATES) {
- System.out.println(" finish pop; seek fwd to " + UnicodeUtil.toHexString(t2.text()));
+ System.out.println(" got term=null (or next field)");
}
- getTermsDict().seekEnum(termEnum, t2);
- if (DEBUG_SURROGATES) {
- System.out.println(" found term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text())));
- }
- surrogateSeekUpto--;
-
- if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) {
- // force pop
- newSuffixStart = -1;
- } else {
- newSuffixStart = termEnum.newSuffixStart;
- }
-
+ newSuffixStart = 0;
+ scratchTerm.length = 0;
return true;
}
}
@@ -358,117 +482,249 @@
return false;
}
- private UnicodeUtil.UTF16Result termBuffer = new UnicodeUtil.UTF16Result();
- private UnicodeUtil.UTF16Result seekBuffer = new UnicodeUtil.UTF16Result();
+ // Pre-flex indices store terms in UTF16 sort order, but
+ // certain queries require Unicode codepoint order; this
+ // method carefully seeks around surrogates to handle
+ // this impedance mismatch
+
+ private void surrogateDance() throws IOException {
+
+ if (!unicodeSortOrder) {
+ return;
+ }
+
+ // We are invoked after TIS.next() (by UTF16 order) to
+ // possibly seek to a different "next" (by unicode
+ // order) term.
+
+ // We scan only the "delta" from the last term to the
+ // current term, in UTF8 bytes. We look at 1) the bytes
+ // stripped from the prior term, and then 2) the bytes
+ // appended to that prior term's prefix.
- private boolean pushNewSurrogate() throws IOException {
+ // We don't care about specific UTF8 sequences, just
+ // the "category" of the UTF16 character. Category S
+ // is a high/low surrogate pair (i.e., non-BMP).
+ // Category E is any BMP char > UNI_SUR_LOW_END (and <
+ // U+FFFF). Category A is the rest (any unicode char
+ // <= UNI_SUR_HIGH_START).
+
+ // The core issue is that pre-flex indices sort the
+ // characters as ASE, while flex must sort as AES. So
+ // when scanning, when we hit S, we must 1) seek
+ // forward to E and enum the terms there, then 2) seek
+ // back to S and enum all terms there, then 3) seek to
+ // after E. Three different seek points (1, 2, 3).
+
+ // We can easily detect S in UTF8: if a byte has
+ // prefix 11110 (0xf0), then that byte and the
+ // following 3 bytes encode a single unicode codepoint
+ // in S. Similarly, we can detect E: if a byte has
+ // prefix 1110111 (0xee), then that byte and the
+ // following 2 bytes encode a single unicode codepoint
+ // in E.
+
+ // Note that this is really a recursive process --
+ // maybe the char at pos 2 needs to dance, but at any
+ // point in its dance, suddenly pos 4 needs to dance
+ // so you must finish pos 4 before returning to pos
+ // 2. But then during pos 4's dance maybe pos 7 needs
+ // to dance, etc. However, despite being recursive,
+ // we don't need to hold any state because the state
+ // can always be derived by looking at prior term &
+ // current term.
+
+ // TODO: can we avoid this copy?
+ if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) {
+ scratchTerm.length = 0;
+ } else {
+ scratchTerm.copy(termEnum.term().bytes());
+ }
+
if (DEBUG_SURROGATES) {
- System.out.println(" check push newSuffix=" + newSuffixStart + " stack=" + getStack());
+ System.out.println(" dance");
+ System.out.println(" prev=" + UnicodeUtil.toHexString(prevTerm.utf8ToString()));
+ System.out.println(" " + prevTerm.toString());
+ System.out.println(" term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()));
+ System.out.println(" " + scratchTerm.toString());
}
- final Term t = termEnum.term();
- if (t == null || t.field() != fieldInfo.name) {
- return false;
+
+ // This code assumes TermInfosReader/SegmentTermEnum
+ // always use BytesRef.offset == 0
+ assert prevTerm.offset == 0;
+ assert scratchTerm.offset == 0;
+
+ // Need to loop here because we may need to do multiple
+ // pops, and possibly a continue in the end, ie:
+ //
+ // cont
+ // pop, cont
+ // pop, pop, cont
+ //
+ //
+
+ while(true) {
+ if (doContinue()) {
+ break;
+ } else {
+ if (!doPop()) {
+ break;
+ }
+ }
}
- final BytesRef bytes = t.bytes();
- UnicodeUtil.UTF8toUTF16(bytes.bytes, bytes.offset, bytes.length, termBuffer);
+ if (DEBUG_SURROGATES) {
+ System.out.println(" finish bmp ends");
+ }
- for(int i=Math.max(0,newSuffixStart);i= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END && (surrogateSeekUpto == 0 || i > surrogateSeekPending[surrogateSeekUpto-1])) {
+ doPushes();
+ }
+
+ // Look for seek type 1 ("push"): if the newly added
+ // suffix contains any S, we must try to seek to the
+ // corresponding E. If we find a match, we go there;
+ // else we keep looking for additional S's in the new
+ // suffix. This "starts" the dance, at this character
+ // position:
+ private void doPushes() throws IOException {
+
+ int upTo = newSuffixStart;
+ if (DEBUG_SURROGATES) {
+ System.out.println(" try push newSuffixStart=" + newSuffixStart + " scratchLen=" + scratchTerm.length);
+ }
+
+ while(upTo < scratchTerm.length) {
+ if (isNonBMPChar(scratchTerm.bytes, upTo) &&
+ (upTo > newSuffixStart ||
+ (upTo >= prevTerm.length ||
+ (!isNonBMPChar(prevTerm.bytes, upTo) &&
+ !isHighBMPChar(prevTerm.bytes, upTo))))) {
+
+ // A non-BMP char (4 bytes UTF8) starts here:
+ assert scratchTerm.length >= upTo + 4;
+
+ final int savLength = scratchTerm.length;
+ scratch[0] = scratchTerm.bytes[upTo];
+ scratch[1] = scratchTerm.bytes[upTo+1];
+ scratch[2] = scratchTerm.bytes[upTo+2];
+
+ scratchTerm.bytes[upTo] = UTF8_HIGH_BMP_LEAD;
+ scratchTerm.bytes[upTo+1] = (byte) 0x80;
+ scratchTerm.bytes[upTo+2] = (byte) 0x80;
+ scratchTerm.length = upTo+3;
+
if (DEBUG_SURROGATES) {
- System.out.println(" found high surr 0x" + Integer.toHexString(ch) + " at pos=" + i);
+ System.out.println(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString() + " len=" + scratchTerm.length);
}
- // the next() that we just did read in a new
- // suffix, containing a surrogate pair
+ // Seek "forward":
+ // TODO: more efficient seek?
+ getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(scratchTerm));
- // seek forward to see if there are any terms with
- // this same prefix, but with characters after the
- // surrogate range; if so, we must first iterate
- // them, then seek back to the surrogates
+ scratchTerm.bytes[upTo] = scratch[0];
+ scratchTerm.bytes[upTo+1] = scratch[1];
+ scratchTerm.bytes[upTo+2] = scratch[2];
+ scratchTerm.length = savLength;
- char[] testPrefix = new char[i+2];
- for(int j=0;j= upTo+3 && isHighBMPChar(b2.bytes, upTo)) {
+ matches = true;
+ for(int i=0;i BMP
+ upTo += 3;
+
+ // NOTE: we keep iterating, now, since this
+ // can easily "recurse". Ie, after seeking
+ // forward at a certain char position, we may
+ // find another surrogate in our [new] suffix
+ // and must then do another seek (recurse)
} else {
- // there are no terms after the surrogates, so
- // we do nothing to the enum and just step
- // through the surrogates like normal. but we
- // must keep iterating through the term, in case
- // another surrogate pair appears later
+ upTo++;
}
+ } else {
+ upTo++;
}
}
+ }
- return false;
+ private boolean unicodeSortOrder;
+
+ void reset(FieldInfo fieldInfo) throws IOException {
+ //System.out.println("pff.reset te=" + termEnum);
+ this.fieldInfo = fieldInfo;
+ protoTerm = new Term(fieldInfo.name);
+ if (termEnum == null) {
+ termEnum = getTermsDict().terms(protoTerm);
+ seekTermEnum = getTermsDict().terms(protoTerm);
+ //System.out.println(" term=" + termEnum.term());
+ } else {
+ getTermsDict().seekEnum(termEnum, protoTerm);
+ }
+ skipNext = true;
+
+ unicodeSortOrder = sortTermsByUnicode();
+
+ final Term t = termEnum.term();
+ if (t != null && t.field() == fieldInfo.name) {
+ newSuffixStart = 0;
+ prevTerm.length = 0;
+ surrogateDance();
+ }
}
@Override
public Comparator getComparator() {
// Pre-flex indexes always sorted in UTF16 order, but
// we remap on-the-fly to unicode order
- return BytesRef.getUTF8SortedAsUnicodeComparator();
+ if (unicodeSortOrder) {
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
+ } else {
+ return BytesRef.getUTF8SortedAsUTF16Comparator();
+ }
}
@Override
@@ -484,7 +740,7 @@
@Override
public SeekStatus seek(BytesRef term, boolean useCache) throws IOException {
if (DEBUG_SURROGATES) {
- System.out.println("TE.seek() term=" + term.utf8ToString());
+ System.out.println("TE.seek target=" + UnicodeUtil.toHexString(term.utf8ToString()));
}
skipNext = false;
final TermInfosReader tis = getTermsDict();
@@ -492,50 +748,142 @@
assert termEnum != null;
- if (termEnum == null) {
- termEnum = tis.terms(t0);
- } else {
- tis.seekEnum(termEnum, t0);
- }
+ tis.seekEnum(termEnum, t0);
- surrogateSeekUpto = 0;
- surrogatesDance();
-
final Term t = termEnum.term();
- final BytesRef tr = t == null ? null : t.bytes();
-
- if (t != null && t.field() == fieldInfo.name && term.bytesEquals(tr)) {
- current = tr;
+ if (t != null && t.field() == fieldInfo.name && term.bytesEquals(t.bytes())) {
+ // If we found an exact match, no need to do the
+ // surrogate dance
+ if (DEBUG_SURROGATES) {
+ System.out.println(" seek exact match");
+ }
+ current = t.bytes();
return SeekStatus.FOUND;
} else if (t == null || t.field() != fieldInfo.name) {
+
+ // TODO: maybe we can handle this like the next()
+ // into null? set term as prevTerm then dance?
+
+ if (DEBUG_SURROGATES) {
+ System.out.println(" seek hit EOF");
+ }
+
+ // We hit EOF; try end-case surrogate dance: if we
+ // find an E, try swapping in S, backwards:
+ scratchTerm.copy(term);
+
+ assert scratchTerm.offset == 0;
+
+ for(int i=scratchTerm.length-1;i>=0;i--) {
+ if (isHighBMPChar(scratchTerm.bytes, i)) {
+ if (DEBUG_SURROGATES) {
+ System.out.println(" found E pos=" + i + "; try seek");
+ }
+
+ if (seekToNonBMP(seekTermEnum, scratchTerm, i)) {
+
+ scratchTerm.copy(seekTermEnum.term().bytes());
+ getTermsDict().seekEnum(termEnum, seekTermEnum.term());
+
+ newSuffixStart = 1+i;
+
+ doPushes();
+
+ // Found a match
+ // TODO: faster seek?
+ current = termEnum.term().bytes();
+ return SeekStatus.NOT_FOUND;
+ }
+ }
+ }
+
+ if (DEBUG_SURROGATES) {
+ System.out.println(" seek END");
+ }
+
current = null;
return SeekStatus.END;
} else {
- current = tr;
- return SeekStatus.NOT_FOUND;
+
+ // We found a non-exact but non-null term; this one
+ // is fun -- just treat it like next, by pretending
+ // requested term was prev:
+ prevTerm.copy(term);
+
+ if (DEBUG_SURROGATES) {
+ System.out.println(" seek hit non-exact term=" + UnicodeUtil.toHexString(t.text()));
+ }
+
+ final BytesRef br = t.bytes();
+ assert br.offset == 0;
+
+ setNewSuffixStart(term, br);
+
+ surrogateDance();
+
+ final Term t2 = termEnum.term();
+ if (t2 == null || t2.field() != fieldInfo.name) {
+ assert t2 == null || !t2.field().equals(fieldInfo.name); // make sure fields are in fact interned
+ current = null;
+ return SeekStatus.END;
+ } else {
+ current = t2.bytes();
+ assert !unicodeSortOrder || term.compareTo(current) < 0 : "term=" + UnicodeUtil.toHexString(term.utf8ToString()) + " vs current=" + UnicodeUtil.toHexString(current.utf8ToString());
+ return SeekStatus.NOT_FOUND;
+ }
}
}
+ private void setNewSuffixStart(BytesRef br1, BytesRef br2) {
+ final int limit = Math.min(br1.length, br2.length);
+ int lastStart = 0;
+ for(int i=0;i getAllExtensions() {
return knownExtensions;
@@ -111,8 +125,5 @@
@Override
public Codec getWriter(SegmentWriteState state) {
return lookup(CodecProvider.getDefaultCodec());
- //return lookup("Pulsing");
- //return lookup("Sep");
- //return lookup("IntBlock");
}
-}
\ No newline at end of file
+}
Index: lucene/src/java/org/apache/lucene/util/BytesRef.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/BytesRef.java (revision 979430)
+++ lucene/src/java/org/apache/lucene/util/BytesRef.java (working copy)
@@ -331,12 +331,17 @@
// We know the terms are not equal, but, we may
// have to carefully fixup the bytes at the
// difference to match UTF16's sort order:
+
+ // NOTE: instead of moving supplementary code points (0xee and 0xef) to the unused 0xfe and 0xff,
+ // we move them to the unused 0xfc and 0xfd [reserved for future 6-byte character sequences]
+ // this reserves 0xff for preflex's term reordering (surrogate dance), and if unicode grows such
+ // that 6-byte sequences are needed we have much bigger problems anyway.
if (aByte >= 0xee && bByte >= 0xee) {
if ((aByte & 0xfe) == 0xee) {
- aByte += 0x10;
+ aByte += 0xe;
}
if ((bByte&0xfe) == 0xee) {
- bByte += 0x10;
+ bByte += 0xe;
}
}
return aByte - bByte;
@@ -346,10 +351,6 @@
// One is a prefix of the other, or, they are equal:
return a.length - b.length;
}
-
- public boolean equals(Object other) {
- return this == other;
- }
}
public void writeExternal(ObjectOutput out)
Property changes on: lucene\build.xml
___________________________________________________________________
Modified: svn:mergeinfo
Merged /lucene/dev/branches/preflexfixes/lucene/build.xml:r967125-979432
Property changes on: lucene\contrib
___________________________________________________________________
Modified: svn:mergeinfo
Merged /lucene/dev/branches/preflexfixes/lucene/contrib:r967125-979432
Property changes on: lucene\contrib\CHANGES.txt
___________________________________________________________________
Modified: svn:mergeinfo
Merged /lucene/dev/branches/preflexfixes/lucene/contrib/CHANGES.txt:r967125-979432
Property changes on: lucene\contrib\instantiated\src\test\org\apache\lucene\store\instantiated\TestIndicesEquals.java
___________________________________________________________________
Modified: svn:mergeinfo
Merged /lucene/dev/branches/preflexfixes/lucene/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java:r967125-979432
Property changes on: lucene\contrib\highlighter\src\test
___________________________________________________________________
Modified: svn:mergeinfo
Merged /lucene/dev/branches/preflexfixes/lucene/contrib/highlighter/src/test:r967125-979432
Index: lucene/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java
===================================================================
--- lucene/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java (revision 979430)
+++ lucene/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java (working copy)
@@ -33,6 +33,7 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TopDocs;
@@ -107,8 +108,8 @@
RAMDirectory ramdir = new RAMDirectory();
Analyzer analyzer = randomAnalyzer();
- IndexWriter writer = new IndexWriter(ramdir, analyzer,
- IndexWriter.MaxFieldLength.UNLIMITED);
+ IndexWriter writer = new IndexWriter(ramdir,
+ new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
Document doc = new Document();
Field field1 = new Field("foo", fooField.toString(), Field.Store.NO, Field.Index.ANALYZED);
Field field2 = new Field("term", termField.toString(), Field.Store.NO, Field.Index.ANALYZED);
Index: lucene/contrib/queries/src/test/org/apache/lucene/search/similar/TestMoreLikeThis.java
===================================================================
--- lucene/contrib/queries/src/test/org/apache/lucene/search/similar/TestMoreLikeThis.java (revision 979430)
+++ lucene/contrib/queries/src/test/org/apache/lucene/search/similar/TestMoreLikeThis.java (working copy)
@@ -28,7 +28,6 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
@@ -46,8 +45,7 @@
protected void setUp() throws Exception {
super.setUp();
directory = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory);
// Add series of docs with specific information for MoreLikeThis
addDoc(writer, "lucene");
Index: lucene/contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java
===================================================================
--- lucene/contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java (revision 979430)
+++ lucene/contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java (working copy)
@@ -20,11 +20,9 @@
import java.io.IOException;
import java.util.HashSet;
-import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.DocsEnum;
@@ -44,8 +42,7 @@
protected void setUp() throws Exception {
super.setUp();
directory = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory);
//Add series of docs with filterable fields : url, text and dates flags
addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
Index: lucene/contrib/queries/src/test/org/apache/lucene/search/TermsFilterTest.java
===================================================================
--- lucene/contrib/queries/src/test/org/apache/lucene/search/TermsFilterTest.java (revision 979430)
+++ lucene/contrib/queries/src/test/org/apache/lucene/search/TermsFilterTest.java (working copy)
@@ -19,11 +19,9 @@
import java.util.HashSet;
-import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.RAMDirectory;
@@ -53,8 +51,7 @@
public void testMissingTerms() throws Exception {
String fieldName="field1";
RAMDirectory rd=new RAMDirectory();
- RandomIndexWriter w = new RandomIndexWriter(newRandom(), rd,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter w = new RandomIndexWriter(newRandom(), rd);
for (int i = 0; i < 100; i++) {
Document doc=new Document();
int term=i*10; //terms are units of 10;
Index: lucene/contrib/queries/src/test/org/apache/lucene/search/ChainedFilterTest.java
===================================================================
--- lucene/contrib/queries/src/test/org/apache/lucene/search/ChainedFilterTest.java (revision 979430)
+++ lucene/contrib/queries/src/test/org/apache/lucene/search/ChainedFilterTest.java (working copy)
@@ -21,11 +21,9 @@
import java.util.GregorianCalendar;
import java.util.Random;
-import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
@@ -63,9 +61,7 @@
super.setUp();
random = newRandom();
directory = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(random, directory,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
-
+ RandomIndexWriter writer = new RandomIndexWriter(random, directory);
Calendar cal = new GregorianCalendar();
cal.clear();
cal.setTimeInMillis(1041397200000L); // 2003 January 01
@@ -200,8 +196,7 @@
public void testWithCachingFilter() throws Exception {
Directory dir = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(random, dir,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(random, dir);
IndexReader reader = writer.getReader();
writer.close();
Index: lucene/contrib/queries/src/test/org/apache/lucene/search/BooleanFilterTest.java
===================================================================
--- lucene/contrib/queries/src/test/org/apache/lucene/search/BooleanFilterTest.java (revision 979430)
+++ lucene/contrib/queries/src/test/org/apache/lucene/search/BooleanFilterTest.java (working copy)
@@ -18,13 +18,13 @@
*/
import java.io.IOException;
+import java.util.Random;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.RAMDirectory;
@@ -38,8 +38,7 @@
protected void setUp() throws Exception {
super.setUp();
directory = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, new IndexWriterConfig(
- TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.WHITESPACE, false)));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, new MockAnalyzer(MockTokenizer.WHITESPACE, false));
//Add series of docs with filterable fields : acces rights, prices, dates and "in-stock" flags
addDoc(writer, "admin guest", "010", "20040101","Y");
Index: lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestRegexQuery.java
===================================================================
--- lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestRegexQuery.java (revision 979430)
+++ lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestRegexQuery.java (working copy)
@@ -20,10 +20,8 @@
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
-import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.IndexSearcher;
@@ -44,8 +42,7 @@
protected void setUp() throws Exception {
super.setUp();
directory = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory);
Document doc = new Document();
doc.add(new Field(FN, "the quick brown fox jumps over the lazy dog", Field.Store.NO, Field.Index.ANALYZED));
writer.addDocument(doc);
Index: lucene/contrib/queries/src/test/org/apache/lucene/search/FuzzyLikeThisQueryTest.java
===================================================================
--- lucene/contrib/queries/src/test/org/apache/lucene/search/FuzzyLikeThisQueryTest.java (revision 979430)
+++ lucene/contrib/queries/src/test/org/apache/lucene/search/FuzzyLikeThisQueryTest.java (working copy)
@@ -25,7 +25,6 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.RAMDirectory;
@@ -41,8 +40,7 @@
protected void setUp() throws Exception {
super.setUp();
directory = new RAMDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory,
- new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
+ RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory);
//Add series of docs with misspelt names
addDoc(writer, "jonathon smythe","1");