Index: lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java (revision 967083) +++ lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java (working copy) @@ -19,6 +19,9 @@ import java.io.IOException; import java.util.Random; +import java.util.Collections; +import java.util.List; +import java.util.ArrayList; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; @@ -53,7 +56,8 @@ @Override protected void setUp() throws Exception { super.setUp(); - random = newRandom(); + // nocommit seed + random = newRandom(-8827956628610066720L); dir = new MockRAMDirectory(); // TODO: fix mocktokenizer to not extend chartokenizer, so you can have an 'empty' keyword. // currently, this means 'empty tokens' arent created/tested in the enumeration: @@ -64,11 +68,23 @@ Document doc = new Document(); Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED); doc.add(field); - + List<String> terms = new ArrayList<String>(); for (int i = 0; i < 2000*_TestUtil.getRandomMultiplier(); i++) { - field.setValue(_TestUtil.randomUnicodeString(random)); + String s = _TestUtil.randomUnicodeString(random); + field.setValue(s); + terms.add(s); writer.addDocument(doc); } + + if (VERBOSE) { + // utf16 order + Collections.sort(terms); + System.out.println("UTF16 order:"); + for(String s : terms) { + System.out.println(" " + UnicodeUtil.toHexString(s)); + } + } + reader = writer.getReader(); searcher = new IndexSearcher(reader); writer.close(); @@ -122,8 +138,11 @@ /** test a bunch of random regular expressions */ public void testRegexps() throws Exception { - for (int i = 0; i < 1000*_TestUtil.getRandomMultiplier(); i++) - assertSame(AutomatonTestUtil.randomRegexp(random).toString()); + + for (int i = 0; i < 1000*_TestUtil.getRandomMultiplier(); i++) { + String reg = AutomatonTestUtil.randomRegexp(random).toString(); + assertSame(reg); + } } /** check that the # of hits is the same as from a very Index: lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java (revision 967083) +++ lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java (working copy) @@ -21,6 +21,7 @@ import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util._TestUtil; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -64,7 +65,7 @@ public void testPrevTermAtEnd() throws IOException { Directory dir = new MockRAMDirectory(); - IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); addDoc(writer, "aaa bbb"); writer.close(); SegmentReader reader = SegmentReader.getOnlySegmentReader(dir); Index: lucene/src/test/org/apache/lucene/index/TestIndexReader.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestIndexReader.java (revision 967083) +++ lucene/src/test/org/apache/lucene/index/TestIndexReader.java (working copy) @@ -1675,7 +1675,7 @@ // LUCENE-1586: getUniqueTermCount public void
testUniqueTermCount() throws Exception { Directory dir = new MockRAMDirectory(); - IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); Document doc = new Document(); doc.add(new Field("field", "a b c d e f g h i j k l m n o p q r s t u v w x y z", Field.Store.NO, Field.Index.ANALYZED)); doc.add(new Field("number", "0 1 2 3 4 5 6 7 8 9", Field.Store.NO, Field.Index.ANALYZED)); @@ -1708,7 +1708,7 @@ // LUCENE-1609: don't load terms index public void testNoTermsIndex() throws Throwable { Directory dir = new MockRAMDirectory(); - IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); Document doc = new Document(); doc.add(new Field("field", "a b c d e f g h i j k l m n o p q r s t u v w x y z", Field.Store.NO, Field.Index.ANALYZED)); doc.add(new Field("number", "0 1 2 3 4 5 6 7 8 9", Field.Store.NO, Field.Index.ANALYZED)); @@ -1725,7 +1725,7 @@ } assertEquals(-1, ((SegmentReader) r.getSequentialSubReaders()[0]).getTermInfosIndexDivisor()); - writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); writer.addDocument(doc); writer.close(); Index: lucene/src/test/org/apache/lucene/index/TestFlex.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestFlex.java (revision 967083) +++ lucene/src/test/org/apache/lucene/index/TestFlex.java (working copy) @@ -20,6 +20,8 @@ import java.io.*; import java.util.*; import org.apache.lucene.store.*; +import org.apache.lucene.index.codecs.*; +import org.apache.lucene.index.codecs.standard.*; import org.apache.lucene.search.*; import org.apache.lucene.analysis.*; import org.apache.lucene.document.*; @@ -64,7 +66,8 @@ public void testTermOrd() throws Exception { Directory d = new MockRAMDirectory(); - IndexWriter w = new IndexWriter(d, new MockAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); + IndexWriter w = new IndexWriter(d, new IndexWriterConfig(TEST_VERSION_CURRENT, + new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); Document doc = new Document(); doc.add(new Field("f", "a b c", Field.Store.NO, Field.Index.ANALYZED)); w.addDocument(doc); Index: lucene/src/test/org/apache/lucene/index/TestIndexWriter.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (revision 967083) +++ lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (working copy) @@ -4559,7 +4559,7 @@ dir.close(); } - // LUCENE-2095: make sure with multiple threads commit + // LUCENE-2095: make sure with multiple threads commit // doesn't return until all changes are in fact in the // index public void testCommitThreadSafety() throws Throwable { @@ -4673,7 +4673,9 @@ // sort in UTF16 sort order by default public void testTermUTF16SortOrder() throws Throwable { Directory dir = new MockRAMDirectory(); - IndexWriter writer = new IndexWriter(dir, new MockAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); + // 
nocommit -- allow preflexrw but must force preflex + // for reading + IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); Document d = new Document(); // Single segment Field f = new Field("f", "", Field.Store.NO, Field.Index.NOT_ANALYZED); d.add(f); @@ -4682,7 +4684,7 @@ Random rnd = newRandom(); final Set<String> allTerms = new HashSet<String>(); - for(int i=0;i<200*_TestUtil.getRandomMultiplier();i++) { + for(int i=0;i<10*_TestUtil.getRandomMultiplier();i++) { final String s; if (rnd.nextBoolean()) { @@ -4705,14 +4707,13 @@ allTerms.add(s); f.setValue(s); - //System.out.println("add " + termDesc(s)); writer.addDocument(d); if ((1+i) % 42 == 0) { writer.commit(); } } - + IndexReader r = writer.getReader(); // Test each sub-segment Index: lucene/src/test/org/apache/lucene/index/RandomIndexWriter.java =================================================================== --- lucene/src/test/org/apache/lucene/index/RandomIndexWriter.java (revision 967083) +++ lucene/src/test/org/apache/lucene/index/RandomIndexWriter.java (working copy) @@ -28,6 +28,7 @@ import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.index.codecs.intblock.IntBlockCodec; import org.apache.lucene.index.codecs.preflex.PreFlexCodec; +import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec; import org.apache.lucene.index.codecs.pulsing.PulsingCodec; import org.apache.lucene.index.codecs.sep.SepCodec; import org.apache.lucene.index.codecs.standard.StandardCodec; @@ -94,12 +95,24 @@ } public IndexReader getReader() throws IOException { + // nocommit + + /* if (r.nextBoolean()) { return w.getReader(); } else { w.commit(); return IndexReader.open(w.getDirectory(), new KeepOnlyLastCommitDeletionPolicy(), r.nextBoolean(), _TestUtil.nextInt(r, 1, 10)); } + */ + + w.commit(); + + return IndexReader.open(w.getDirectory(), + null, + false, + _TestUtil.nextInt(r, 1, 10), + _TestUtil.alwaysCodec(new PreFlexCodec())); } public void close() throws IOException { @@ -119,10 +132,15 @@ RandomCodecProvider(Random random) { register(new StandardCodec()); register(new IntBlockCodec()); - register(new PreFlexCodec()); + // nocommit + //register(new PreFlexCodec()); + register(new PreFlexRWCodec()); register(new PulsingCodec()); register(new SepCodec()); - codec = CodecProvider.CORE_CODECS[random.nextInt(CodecProvider.CORE_CODECS.length)]; + // nocommit + //codec = + //CodecProvider.CORE_CODECS[random.nextInt(CodecProvider.CORE_CODECS.length)]; + codec = "PreFlex"; } @Override Index: lucene/src/test/org/apache/lucene/index/TestMultiFields.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestMultiFields.java (revision 967083) +++ lucene/src/test/org/apache/lucene/index/TestMultiFields.java (working copy) @@ -27,12 +27,13 @@ public void testRandom() throws Exception { + Random r = newRandom(); + for(int iter=0;iter<2*_TestUtil.getRandomMultiplier();iter++) { Directory dir = new MockRAMDirectory(); + IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setMergePolicy(NoMergePolicy.COMPOUND_FILES)); - Random r = new Random(); - Map<BytesRef,List<Integer>> docs = new HashMap<BytesRef,List<Integer>>(); Set<Integer> deleted = new HashSet<Integer>(); List<BytesRef> terms = new ArrayList<BytesRef>(); @@ -45,7 +46,7 @@ doc.add(id); boolean onlyUniqueTerms = r.nextBoolean(); - + Set<BytesRef> uniqueTerms = new HashSet<BytesRef>(); for(int i=0;i<numDocs;i++) { final String s; if (!onlyUniqueTerms && r.nextBoolean() && terms.size() > 0) { @@ -61,6 +62,7 @@ } docs.get(term).add(i); terms.add(term); + uniqueTerms.add(term);
f.setValue(s); } id.setValue(""+i); @@ -75,8 +77,19 @@ } } + if (VERBOSE) { + List<BytesRef> termsList = new ArrayList<BytesRef>(uniqueTerms); + Collections.sort(termsList, BytesRef.getUTF8SortedAsUTF16Comparator()); + System.out.println("UTF16 order:"); + for(BytesRef b : termsList) { + System.out.println(" " + UnicodeUtil.toHexString(b.utf8ToString())); + } + } + + // nocommit IndexReader reader = w.getReader(); w.close(); + //System.out.println("TEST reader=" + reader); Bits delDocs = MultiFields.getDeletedDocs(reader); for(int delDoc : deleted) { Index: lucene/src/test/org/apache/lucene/index/TestCodecs.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestCodecs.java (revision 967083) +++ lucene/src/test/org/apache/lucene/index/TestCodecs.java (working copy) @@ -493,15 +493,22 @@ // Test random seek by ord: final int idx = TestCodecs.this.nextInt(field.terms.length); term = field.terms[idx]; - status = termsEnum.seek(idx); - assertEquals(status, TermsEnum.SeekStatus.FOUND); - assertTrue(termsEnum.term().bytesEquals(new BytesRef(term.text2))); - assertEquals(term.docs.length, termsEnum.docFreq()); - if (field.omitTF) { - this.verifyDocs(term.docs, term.positions, termsEnum.docs(null, null), false); - } else { - this.verifyDocs(term.docs, term.positions, termsEnum.docsAndPositions(null, null), true); + try { + status = termsEnum.seek(idx); + } catch (UnsupportedOperationException uoe) { + // ok -- skip it + status = null; } + if (status != null) { + assertEquals(status, TermsEnum.SeekStatus.FOUND); + assertTrue(termsEnum.term().bytesEquals(new BytesRef(term.text2))); + assertEquals(term.docs.length, termsEnum.docFreq()); + if (field.omitTF) { + this.verifyDocs(term.docs, term.positions, termsEnum.docs(null, null), false); + } else { + this.verifyDocs(term.docs, term.positions, termsEnum.docsAndPositions(null, null), true); + } + } // Test seek to non-existent terms: for(int i=0;i<100;i++) { @@ -520,9 +527,12 @@ // Seek to each term by ord, backwards for(int i=field.terms.length-1;i>=0;i--) { - assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seek(i)); - assertEquals(field.terms[i].docs.length, termsEnum.docFreq()); - assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[i].text2))); + try { + assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seek(i)); + assertEquals(field.terms[i].docs.length, termsEnum.docFreq()); + assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[i].text2))); + } catch (UnsupportedOperationException uoe) { + } } // Seek to non-existent empty-string term Index: lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java (revision 967083) +++ lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java (working copy) @@ -394,18 +394,18 @@ } public void testDeletesOnDiskFull() throws IOException { - testOperationsOnDiskFull(false); + doTestOperationsOnDiskFull(false); } public void testUpdatesOnDiskFull() throws IOException { - testOperationsOnDiskFull(true); + doTestOperationsOnDiskFull(true); } /** * Make sure if modifier tries to commit but hits disk full that modifier * remains consistent and usable.
Similar to TestIndexReader.testDiskFull(). */ - private void testOperationsOnDiskFull(boolean updates) throws IOException { + private void doTestOperationsOnDiskFull(boolean updates) throws IOException { Term searchTerm = new Term("content", "aaa"); int START_COUNT = 157; @@ -700,6 +700,7 @@ try { modifier.commit(); } catch (IOException ioe) { + // expected failed = true; } Index: lucene/src/test/org/apache/lucene/index/TestAddIndexes.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestAddIndexes.java (revision 967083) +++ lucene/src/test/org/apache/lucene/index/TestAddIndexes.java (working copy) @@ -19,7 +19,6 @@ import java.io.IOException; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; @@ -139,7 +138,6 @@ setUpDirs(dir, aux); IndexWriter writer = newWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setOpenMode(OpenMode.APPEND)); - writer.addIndexes(new Directory[] {aux}); // Adds 10 docs, then replaces them with another 10 Index: lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java =================================================================== --- lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java (revision 967083) +++ lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java (working copy) @@ -1,225 +0,0 @@ -package org.apache.lucene.index.codecs.preflex; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -import java.io.IOException; -import org.apache.lucene.store.*; -import org.apache.lucene.index.*; -import org.apache.lucene.util.*; - - -/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a - Directory. A TermInfos can be written once, in order. */ - -final class TermInfosWriter { - /** The file format version, a negative number. */ - public static final int FORMAT = -3; - - // Changed strings to true utf8 with length-in-bytes not - // length-in-chars - public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4; - - // NOTE: always change this if you switch to a new format! - public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES; - - private FieldInfos fieldInfos; - private IndexOutput output; - private TermInfo lastTi = new TermInfo(); - private long size; - - // TODO: the default values for these two parameters should be settable from - // IndexWriter. However, once that's done, folks will start setting them to - // ridiculous values and complaining that things don't work well, as with - // mergeFactor. So, let's wait until a number of folks find that alternate - // values work better.
Note that both of these values are stored in the - // segment, so that it's safe to change these w/o rebuilding all indexes. - - /** Expert: The fraction of terms in the "dictionary" which should be stored - * in RAM. Smaller values use more memory, but make searching slightly - * faster, while larger values use less memory and make searching slightly - * slower. Searching is typically not dominated by dictionary lookup, so - * tweaking this is rarely useful.*/ - int indexInterval = 128; - - /** Expert: The fraction of {@link TermDocs} entries stored in skip tables, - * used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in - * smaller indexes, greater acceleration, but fewer accelerable cases, while - * smaller values result in bigger indexes, less acceleration and more - * accelerable cases. More detailed experiments would be useful here. */ - int skipInterval = 16; - - /** Expert: The maximum number of skip levels. Smaller values result in - * slightly smaller indexes, but slower skipping in big posting lists. - */ - int maxSkipLevels = 10; - - private long lastIndexPointer; - private boolean isIndex; - private byte[] lastTermBytes = new byte[10]; - private int lastTermBytesLength = 0; - private int lastFieldNumber = -1; - - private TermInfosWriter other; - - TermInfosWriter(Directory directory, String segment, FieldInfos fis, - int interval) - throws IOException { - initialize(directory, segment, fis, interval, false); - other = new TermInfosWriter(directory, segment, fis, interval, true); - other.other = this; - } - - private TermInfosWriter(Directory directory, String segment, FieldInfos fis, - int interval, boolean isIndex) throws IOException { - initialize(directory, segment, fis, interval, isIndex); - } - - private void initialize(Directory directory, String segment, FieldInfos fis, - int interval, boolean isi) throws IOException { - indexInterval = interval; - fieldInfos = fis; - isIndex = isi; - output = directory.createOutput(segment + (isIndex ? ".tii" : ".tis")); - output.writeInt(FORMAT_CURRENT); // write format - output.writeLong(0); // leave space for size - output.writeInt(indexInterval); // write indexInterval - output.writeInt(skipInterval); // write skipInterval - output.writeInt(maxSkipLevels); // write maxSkipLevels - assert initUTF16Results(); - } - - void add(Term term, TermInfo ti) throws IOException { - add(fieldInfos.fieldNumber(term.field()), term.bytes().bytes, term.bytes().length, ti); - } - - // Currently used only by assert statements - UnicodeUtil.UTF16Result utf16Result1; - UnicodeUtil.UTF16Result utf16Result2; - - // Currently used only by assert statements - private boolean initUTF16Results() { - utf16Result1 = new UnicodeUtil.UTF16Result(); - utf16Result2 = new UnicodeUtil.UTF16Result(); - return true; - } - - // Currently used only by assert statement - private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) { - - if (lastFieldNumber != fieldNumber) { - final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber)); - // If there is a field named "" (empty string) then we - // will get 0 on this comparison, yet, it's "OK". But - // it's not OK if two different field numbers map to - // the same name. 
- if (cmp != 0 || lastFieldNumber != -1) - return cmp; - } - - UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1); - UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2); - final int len; - if (utf16Result1.length < utf16Result2.length) - len = utf16Result1.length; - else - len = utf16Result2.length; - - for(int i=0;i<len;i++) { - final char ch1 = utf16Result1.result[i]; - final char ch2 = utf16Result2.result[i]; - if (ch1 != ch2) - return ch1-ch2; - } - - return utf16Result1.length - utf16Result2.length; - } - - /** Adds a new <Term, TermInfo> pair to the set. - Term must be lexicographically greater than all previous Terms added. - TermInfo pointers must be positive and greater than all previous.*/ - void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti) - throws IOException { - - assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 || - (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) : - "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" + - " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" + - " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8"); - - assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")"; - assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")"; - - if (!isIndex && size % indexInterval == 0) - other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term - - writeTerm(fieldNumber, termBytes, termBytesLength); // write term - - output.writeVInt(ti.docFreq); // write doc freq - output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers - output.writeVLong(ti.proxPointer - lastTi.proxPointer); - - if (ti.docFreq >= skipInterval) { - output.writeVInt(ti.skipOffset); - } - - if (isIndex) { - output.writeVLong(other.output.getFilePointer() - lastIndexPointer); - lastIndexPointer = other.output.getFilePointer(); // write pointer - } - - lastFieldNumber = fieldNumber; - lastTi.set(ti); - size++; - } - - private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength) - throws IOException { - - // TODO: UTF16toUTF8 could tell us this prefix - // Compute prefix in common with last term: - int start = 0; - final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength; - while(start < limit) { - if (termBytes[start] != lastTermBytes[start]) - break; - start++; - } - - final int length = termBytesLength - start; - output.writeVInt(start); // write shared prefix length - output.writeVInt(length); // write delta length - output.writeBytes(termBytes, start, length); // write delta bytes - output.writeVInt(fieldNumber); // write field num - if (lastTermBytes.length < termBytesLength) { - lastTermBytes = ArrayUtil.grow(lastTermBytes, termBytesLength); - } - System.arraycopy(termBytes, start, lastTermBytes, start, length); - lastTermBytesLength = termBytesLength; - } - - /** Called to complete TermInfos creation.
*/ - void close() throws IOException { - output.seek(4); // write size after format - output.writeLong(size); - output.close(); - - if (!isIndex) - other.close(); - } - -} Index: lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java =================================================================== --- lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java (revision 967083) +++ lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java (working copy) @@ -18,8 +18,11 @@ */ import org.apache.lucene.store.*; +import org.apache.lucene.document.*; +import org.apache.lucene.analysis.*; import org.apache.lucene.index.*; import org.apache.lucene.index.codecs.*; +import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec; import org.apache.lucene.util.*; import java.util.*; @@ -30,8 +33,6 @@ public class TestSurrogates extends LuceneTestCaseJ4 { - // chooses from a very limited alphabet to exacerbate the - // surrogate seeking required private static String makeDifficultRandomUnicodeString(Random r) { final int end = r.nextInt(20); if (end == 0) { @@ -44,154 +45,295 @@ if (0 == t && i < end - 1) { // hi - buffer[i++] = (char) 0xd800; + buffer[i++] = (char) (0xd800 + r.nextInt(2)); // lo - buffer[i] = (char) 0xdc00; + buffer[i] = (char) (0xdc00 + r.nextInt(2)); } else if (t <= 3) { - buffer[i] = 'a'; + buffer[i] = (char) ('a' + r.nextInt(2)); } else if (4 == t) { - buffer[i] = 0xe000; + buffer[i] = (char) (0xe000 + r.nextInt(2)); } } return new String(buffer, 0, end); } - private SegmentInfo makePreFlexSegment(Random r, String segName, Directory dir, FieldInfos fieldInfos, Codec codec, List<Term> fieldTerms) throws IOException { + private String toHexString(Term t) { + return t.field() + ":" + UnicodeUtil.toHexString(t.text()); + } - final int numField = _TestUtil.nextInt(r, 2, 5); + private String getRandomString(Random r) { + String s; + if (r.nextInt(3) == 1) { + s = makeDifficultRandomUnicodeString(r); + } else { + s = _TestUtil.randomUnicodeString(r); + } + return s; + } - List<Term> terms = new ArrayList<Term>(); + private static class SortTermAsUTF16Comparator implements Comparator<Term> { + public int compare(Term o1, Term o2) { + return o1.compareToUTF16(o2); + } + } - int tc = 0; + private static final SortTermAsUTF16Comparator termAsUTF16Comparator = new SortTermAsUTF16Comparator(); - for(int f=0;f<numField;f++) { + private void doTestStraightEnum(List<Term> fieldTerms, IndexReader reader, int uniqueTermCount) throws IOException { - fieldInfos.add(field, true, false, false, false, false, false, false); - final int numTerms = 10000*_TestUtil.getRandomMultiplier(); - for(int i=0;i<numTerms;i++) { + int ct = _TestUtil.nextInt(r, 5, 100); + for(int i=0;i<ct;i++) { + if (1+spot+i >= fieldTerms.size()) { + break; + } + term = fieldTerms.get(1+spot+i); + if (term.field() != field) { + assertNull(te.next()); + break; + } else { + BytesRef t = te.next(); + + if (VERBOSE) { + System.out.println(" got term=" + (t == null ?
null : UnicodeUtil.toHexString(t.utf8ToString()))); + System.out.println(" exp=" + UnicodeUtil.toHexString(term.text().toString())); + } + + assertEquals(term.bytes(), t); + } + } } - w.close(); + } - Collections.sort(fieldTerms); + private void doTestSeekDoesNotExist(Random r, int numField, List<Term> fieldTerms, Term[] fieldTermsArray, IndexReader reader) throws IOException { + + final Map<String,TermsEnum> tes = new HashMap<String,TermsEnum>(); + if (VERBOSE) { - System.out.println("\nTEST: codepoint order"); - for(Term t: fieldTerms) { - System.out.println(" " + t.field() + ":" + toHexString(t)); - } + System.out.println("TEST: top random seeks"); } - dir.createOutput(segName + ".prx").close(); - dir.createOutput(segName + ".frq").close(); + { + for(int iter=0;iter<100*_TestUtil.getRandomMultiplier();iter++) { + + // seek to random spot + String field = ("f" + r.nextInt(numField)).intern(); + Term tx = new Term(field, getRandomString(r)); - // !!hack alert!! stuffing uniqueTermCount in as docCount - return new SegmentInfo(segName, uniqueTermCount, dir, false, -1, null, false, true, codec); + int spot = Arrays.binarySearch(fieldTermsArray, tx); + + if (spot < 0) { + if (VERBOSE) { + System.out.println("TEST: non-exist seek to " + field + ":" + UnicodeUtil.toHexString(tx.text())); + } + + // term does not exist: + TermsEnum te = tes.get(field); + if (te == null) { + te = MultiFields.getTerms(reader, field).iterator(); + tes.put(field, te); + } + + if (VERBOSE) { + System.out.println(" got enum"); + } + + spot = -spot - 1; + + if (spot == fieldTerms.size() || fieldTerms.get(spot).field() != field) { + assertEquals(TermsEnum.SeekStatus.END, te.seek(tx.bytes())); + } else { + assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seek(tx.bytes())); + + if (VERBOSE) { + System.out.println(" got term=" + UnicodeUtil.toHexString(te.term().utf8ToString())); + System.out.println(" exp term=" + UnicodeUtil.toHexString(fieldTerms.get(spot).text())); + } + + assertEquals(fieldTerms.get(spot).bytes(), + te.term()); + + // now .next() this many times: + int ct = _TestUtil.nextInt(r, 5, 100); + for(int i=0;i<ct;i++) { + if (1+spot+i >= fieldTerms.size()) { + break; + } + Term term = fieldTerms.get(1+spot+i); + if (term.field() != field) { + assertNull(te.next()); + break; + } else { + BytesRef t = te.next(); + + if (VERBOSE) { + System.out.println(" got term=" + (t == null ? null : UnicodeUtil.toHexString(t.utf8ToString()))); + System.out.println(" exp=" + UnicodeUtil.toHexString(term.text().toString())); + } + + assertEquals(term.bytes(), t); + } + } + + } + } + } + } } - private String toHexString(Term t) { - return t.field() + ":" + UnicodeUtil.toHexString(t.text()); - } - + @Test public void testSurrogatesOrder() throws Exception { + Random r = newRandom(); + Directory dir = new MockRAMDirectory(); + RandomIndexWriter w = new RandomIndexWriter(r, + dir, + new IndexWriterConfig(TEST_VERSION_CURRENT, + new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec(new PreFlexRWCodec()))); - Codec codec = new PreFlexCodec(); + final int numField = _TestUtil.nextInt(r, 2, 5); - Random r = newRandom(); - FieldInfos fieldInfos = new FieldInfos(); + int uniqueTermCount = 0; + + int tc = 0; + List<Term> fieldTerms = new ArrayList<Term>(); - SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms); - // hack alert!! - int uniqueTermCount = si.docCount; + for(int f=0;f<numField;f++) { + String field = "f" + f; + final int numTerms = 10000*_TestUtil.getRandomMultiplier(); + final Set<String> uniqueTerms = new HashSet<String>(); + + for(int i=0;i<numTerms;i++) { Index: lucene/src/test/org/apache/lucene/index/codecs/preflexrw/TermInfosWriter.java =================================================================== --- lucene/src/test/org/apache/lucene/index/codecs/preflexrw/TermInfosWriter.java (revision 0) +++ lucene/src/test/org/apache/lucene/index/codecs/preflexrw/TermInfosWriter.java (revision 0) +package org.apache.lucene.index.codecs.preflexrw; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import org.apache.lucene.store.*; +import org.apache.lucene.index.*; +import org.apache.lucene.util.*; + +/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a + Directory. A TermInfos can be written once, in order. */ + +final class TermInfosWriter { + /** The file format version, a negative number.
*/ + public static final int FORMAT = -3; + + // Changed strings to true utf8 with length-in-bytes not + // length-in-chars + public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4; + + // NOTE: always change this if you switch to a new format! + public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES; + + private FieldInfos fieldInfos; + private IndexOutput output; + private TermInfo lastTi = new TermInfo(); + private long size; + + // TODO: the default values for these two parameters should be settable from + // IndexWriter. However, once that's done, folks will start setting them to + // ridiculous values and complaining that things don't work well, as with + // mergeFactor. So, let's wait until a number of folks find that alternate + // values work better. Note that both of these values are stored in the + // segment, so that it's safe to change these w/o rebuilding all indexes. + + /** Expert: The fraction of terms in the "dictionary" which should be stored + * in RAM. Smaller values use more memory, but make searching slightly + * faster, while larger values use less memory and make searching slightly + * slower. Searching is typically not dominated by dictionary lookup, so + * tweaking this is rarely useful.*/ + int indexInterval = 128; + + /** Expert: The fraction of {@link TermDocs} entries stored in skip tables, + * used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in + * smaller indexes, greater acceleration, but fewer accelerable cases, while + * smaller values result in bigger indexes, less acceleration and more + * accelerable cases. More detailed experiments would be useful here. */ + int skipInterval = 16; + + /** Expert: The maximum number of skip levels. Smaller values result in + * slightly smaller indexes, but slower skipping in big posting lists. + */ + int maxSkipLevels = 10; + + private long lastIndexPointer; + private boolean isIndex; + private final BytesRef lastTerm = new BytesRef(); + private int lastFieldNumber = -1; + + private TermInfosWriter other; + + TermInfosWriter(Directory directory, String segment, FieldInfos fis, + int interval) + throws IOException { + initialize(directory, segment, fis, interval, false); + other = new TermInfosWriter(directory, segment, fis, interval, true); + other.other = this; + } + + private TermInfosWriter(Directory directory, String segment, FieldInfos fis, + int interval, boolean isIndex) throws IOException { + initialize(directory, segment, fis, interval, isIndex); + } + + private void initialize(Directory directory, String segment, FieldInfos fis, + int interval, boolean isi) throws IOException { + indexInterval = interval; + fieldInfos = fis; + isIndex = isi; + output = directory.createOutput(segment + (isIndex ? 
".tii" : ".tis")); + output.writeInt(FORMAT_CURRENT); // write format + output.writeLong(0); // leave space for size + output.writeInt(indexInterval); // write indexInterval + output.writeInt(skipInterval); // write skipInterval + output.writeInt(maxSkipLevels); // write maxSkipLevels + assert initUTF16Results(); + } + + // Currently used only by assert statements + UnicodeUtil.UTF16Result utf16Result1; + UnicodeUtil.UTF16Result utf16Result2; + private final BytesRef scratchBytes = new BytesRef(); + + // Currently used only by assert statements + private boolean initUTF16Results() { + utf16Result1 = new UnicodeUtil.UTF16Result(); + utf16Result2 = new UnicodeUtil.UTF16Result(); + return true; + } + + // Currently used only by assert statement + private int compareToLastTerm(int fieldNumber, BytesRef term) { + + if (lastFieldNumber != fieldNumber) { + final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber)); + // If there is a field named "" (empty string) then we + // will get 0 on this comparison, yet, it's "OK". But + // it's not OK if two different field numbers map to + // the same name. + if (cmp != 0 || lastFieldNumber != -1) + return cmp; + } + + scratchBytes.copy(term); + assert lastTerm.offset == 0; + UnicodeUtil.UTF8toUTF16(lastTerm.bytes, 0, lastTerm.length, utf16Result1); + + assert scratchBytes.offset == 0; + UnicodeUtil.UTF8toUTF16(scratchBytes.bytes, 0, scratchBytes.length, utf16Result2); + + final int len; + if (utf16Result1.length < utf16Result2.length) + len = utf16Result1.length; + else + len = utf16Result2.length; + + for(int i=0;i, TermInfo> pair to the set. + Term must be lexicographically greater than all previous Terms added. + TermInfo pointers must be positive and greater than all previous.*/ + public void add(int fieldNumber, BytesRef term, TermInfo ti) + throws IOException { + + assert compareToLastTerm(fieldNumber, term) < 0 || + (isIndex && term.length == 0 && lastTerm.length == 0) : + "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" + + " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" + + " text=" + term.utf8ToString() + " lastText=" + lastTerm.utf8ToString(); + + assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")"; + assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")"; + + if (!isIndex && size % indexInterval == 0) + other.add(lastFieldNumber, lastTerm, lastTi); // add an index term + + writeTerm(fieldNumber, term); // write term + + output.writeVInt(ti.docFreq); // write doc freq + output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers + output.writeVLong(ti.proxPointer - lastTi.proxPointer); + + if (ti.docFreq >= skipInterval) { + output.writeVInt(ti.skipOffset); + } + + if (isIndex) { + output.writeVLong(other.output.getFilePointer() - lastIndexPointer); + lastIndexPointer = other.output.getFilePointer(); // write pointer + } + + lastFieldNumber = fieldNumber; + lastTi.set(ti); + size++; + } + + private void writeTerm(int fieldNumber, BytesRef term) + throws IOException { + + //System.out.println(" tiw.write field=" + fieldNumber + " term=" + term.utf8ToString()); + + // TODO: UTF16toUTF8 could tell us this prefix + // Compute prefix in common with last term: + int start = 0; + final int limit = term.length < lastTerm.length ? 
term.length : lastTerm.length; + while(start < limit) { + if (term.bytes[start+term.offset] != lastTerm.bytes[start+lastTerm.offset]) + break; + start++; + } + + final int length = term.length - start; + output.writeVInt(start); // write shared prefix length + output.writeVInt(length); // write delta length + output.writeBytes(term.bytes, start+term.offset, length); // write delta bytes + output.writeVInt(fieldNumber); // write field num + lastTerm.copy(term); + } + + /** Called to complete TermInfos creation. */ + void close() throws IOException { + output.seek(4); // write size after format + output.writeLong(size); + output.close(); + + if (!isIndex) + other.close(); + } + +} Property changes on: lucene/src/test/org/apache/lucene/index/codecs/preflexrw/TermInfosWriter.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java =================================================================== --- lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java (revision 0) +++ lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java (revision 0) @@ -0,0 +1,212 @@ +package org.apache.lucene.index.codecs.preflexrw; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.index.codecs.FieldsConsumer; +import org.apache.lucene.index.codecs.TermsConsumer; +import org.apache.lucene.index.codecs.PostingsConsumer; +import org.apache.lucene.index.codecs.standard.DefaultSkipListWriter; +import org.apache.lucene.index.codecs.preflex.PreFlexCodec; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.codecs.preflex.TermInfo; +import org.apache.lucene.store.IndexOutput; + +import java.io.IOException; +import java.util.Comparator; + +class PreFlexFieldsWriter extends FieldsConsumer { + + private final TermInfosWriter termsOut; + private final IndexOutput freqOut; + private final IndexOutput proxOut; + private final DefaultSkipListWriter skipListWriter; + private final int totalNumDocs; + + public PreFlexFieldsWriter(SegmentWriteState state) throws IOException { + termsOut = new TermInfosWriter(state.directory, + state.segmentName, + state.fieldInfos, + state.termIndexInterval); + state.flushedFiles.add(IndexFileNames.segmentFileName(state.segmentName, "", PreFlexCodec.TERMS_EXTENSION)); + state.flushedFiles.add(IndexFileNames.segmentFileName(state.segmentName, "", PreFlexCodec.TERMS_INDEX_EXTENSION)); + + final String freqFile = IndexFileNames.segmentFileName(state.segmentName, "", PreFlexCodec.FREQ_EXTENSION); + freqOut = state.directory.createOutput(freqFile); + state.flushedFiles.add(freqFile); + totalNumDocs = state.numDocs; + + if (state.fieldInfos.hasProx()) { + final String proxFile = IndexFileNames.segmentFileName(state.segmentName, "", PreFlexCodec.PROX_EXTENSION); + proxOut = state.directory.createOutput(proxFile); + state.flushedFiles.add(proxFile); + } else { + proxOut = null; + } + + skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval, + termsOut.maxSkipLevels, + totalNumDocs, + freqOut, + proxOut); + //System.out.println("\nw start seg=" + segment); + } + + @Override + public TermsConsumer addField(FieldInfo field) throws IOException { + assert field.number != -1; + //System.out.println("w field=" + field.name + " storePayload=" + field.storePayloads + " number=" + field.number); + return new PreFlexTermsWriter(field); + } + + @Override + public void close() throws IOException { + termsOut.close(); + freqOut.close(); + if (proxOut != null) { + proxOut.close(); + } + } + + private class PreFlexTermsWriter extends TermsConsumer { + private final FieldInfo fieldInfo; + private final boolean omitTF; + private final boolean storePayloads; + + private final TermInfo termInfo = new TermInfo(); + private final PostingsWriter postingsWriter = new PostingsWriter(); + + public PreFlexTermsWriter(FieldInfo fieldInfo) { + this.fieldInfo = fieldInfo; + omitTF = fieldInfo.omitTermFreqAndPositions; + storePayloads = fieldInfo.storePayloads; + } + + private class PostingsWriter extends PostingsConsumer { + private int lastDocID; + private int lastPayloadLength = -1; + private int lastPosition; + private int df; + + public PostingsWriter reset() { + df = 0; + lastDocID = 0; + lastPayloadLength = -1; + return this; + } + + @Override + public void startDoc(int docID, int termDocFreq) throws IOException { + //System.out.println(" w doc=" + docID); + + final int delta = docID - lastDocID; + if (docID < 0 || (df > 0 && delta <= 0)) { + throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + 
" )"); + } + + if ((++df % termsOut.skipInterval) == 0) { + skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength); + skipListWriter.bufferSkip(df); + } + + lastDocID = docID; + + assert docID < totalNumDocs: "docID=" + docID + " totalNumDocs=" + totalNumDocs; + + if (omitTF) { + freqOut.writeVInt(delta); + } else { + final int code = delta << 1; + if (termDocFreq == 1) { + freqOut.writeVInt(code|1); + } else { + freqOut.writeVInt(code); + freqOut.writeVInt(termDocFreq); + } + } + lastPosition = 0; + } + + @Override + public void addPosition(int position, BytesRef payload) throws IOException { + assert proxOut != null; + + //System.out.println(" w pos=" + position + " payl=" + payload); + final int delta = position - lastPosition; + lastPosition = position; + + if (storePayloads) { + final int payloadLength = payload == null ? 0 : payload.length; + if (payloadLength != lastPayloadLength) { + //System.out.println(" write payload len=" + payloadLength); + lastPayloadLength = payloadLength; + proxOut.writeVInt((delta<<1)|1); + proxOut.writeVInt(payloadLength); + } else { + proxOut.writeVInt(delta << 1); + } + if (payloadLength > 0) { + proxOut.writeBytes(payload.bytes, payload.offset, payload.length); + } + } else { + proxOut.writeVInt(delta); + } + } + + @Override + public void finishDoc() throws IOException { + } + } + + @Override + public PostingsConsumer startTerm(BytesRef text) throws IOException { + //System.out.println(" w term=" + text.utf8ToString()); + skipListWriter.resetSkip(); + termInfo.freqPointer = freqOut.getFilePointer(); + if (proxOut != null) { + termInfo.proxPointer = proxOut.getFilePointer(); + } + return postingsWriter.reset(); + } + + @Override + public void finishTerm(BytesRef text, int numDocs) throws IOException { + if (numDocs > 0) { + long skipPointer = skipListWriter.writeSkip(freqOut); + termInfo.docFreq = numDocs; + termInfo.skipOffset = (int) (skipPointer - termInfo.freqPointer); + //System.out.println(" w finish term=" + text.utf8ToString() + " fnum=" + fieldInfo.number); + termsOut.add(fieldInfo.number, + text, + termInfo); + } + } + + @Override + public void finish() throws IOException { + } + + @Override + public Comparator getComparator() throws IOException { + return BytesRef.getUTF8SortedAsUTF16Comparator(); + } + } +} \ No newline at end of file Property changes on: lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexRWCodec.java =================================================================== --- lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexRWCodec.java (revision 0) +++ lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexRWCodec.java (revision 0) @@ -0,0 +1,52 @@ +package org.apache.lucene.index.codecs.preflexrw; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.codecs.preflex.PreFlexCodec; +import org.apache.lucene.index.codecs.preflex.PreFlexFields; +import org.apache.lucene.index.codecs.FieldsConsumer; +import org.apache.lucene.index.codecs.FieldsProducer; + +/** Codec, only for testing, that can write and read the + * pre-flex index format. + * + * @lucene.experimental + */ +public class PreFlexRWCodec extends PreFlexCodec { + + public PreFlexRWCodec() { + // NOTE: we use same name as core PreFlex codec so that + // it can read the segments we write! + super(); + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + return new PreFlexFieldsWriter(state); + } + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + //System.out.println("preflexrw"); + return new PreFlexFields(state.dir, state.fieldInfos, state.segmentInfo, state.readBufferSize, state.termsIndexDivisor, false); + } +} Property changes on: lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexRWCodec.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/test/org/apache/lucene/util/LuceneTestCaseJ4.java =================================================================== --- lucene/src/test/org/apache/lucene/util/LuceneTestCaseJ4.java (revision 967083) +++ lucene/src/test/org/apache/lucene/util/LuceneTestCaseJ4.java (working copy) @@ -22,6 +22,9 @@ import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.FieldCache.CacheEntry; import org.apache.lucene.util.FieldCacheSanityChecker.Insanity; +import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec; + import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -405,4 +408,9 @@ private String name = ""; + // nocommit -- not sure this is sufficient? + static { + CodecProvider.getDefault().register(new PreFlexRWCodec()); + CodecProvider.setDefaultCodec("PreFlex"); + } } Index: lucene/src/test/org/apache/lucene/util/LuceneTestCase.java =================================================================== --- lucene/src/test/org/apache/lucene/util/LuceneTestCase.java (revision 967083) +++ lucene/src/test/org/apache/lucene/util/LuceneTestCase.java (working copy) @@ -34,6 +34,8 @@ import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.FieldCache.CacheEntry; import org.apache.lucene.util.FieldCacheSanityChecker.Insanity; +import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec; /** * Base class for all Lucene unit tests. 
@@ -298,4 +300,11 @@ // static members private static final Random seedRnd = new Random(); + + // nocommit + static { + CodecProvider.getDefault().register(new PreFlexRWCodec()); + CodecProvider.setDefaultCodec("PreFlex"); + } + } Index: lucene/src/test/org/apache/lucene/util/_TestUtil.java =================================================================== --- lucene/src/test/org/apache/lucene/util/_TestUtil.java (revision 967083) +++ lucene/src/test/org/apache/lucene/util/_TestUtil.java (working copy) @@ -23,6 +23,9 @@ import org.apache.lucene.index.MergeScheduler; import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.CheckIndex; +import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.store.Directory; import java.io.ByteArrayOutputStream; import java.io.PrintStream; @@ -129,9 +132,25 @@ } final char[] buffer = new char[end]; for (int i = 0; i < end; i++) { + int t = r.nextInt(5); + //buffer[i] = (char) (97 + r.nextInt(26)); + + /* if (0 == t && i < end - 1) { + // hi + buffer[i++] = (char) 0xd800; + // lo + buffer[i] = (char) 0xdc00; + } else if (t <= 3) { + buffer[i] = 'a'; + } else if (4 == t) { + buffer[i] = 0xe000; + } + */ + + if (0 == t && i < end - 1) { // Make a surrogate pair // High surrogate buffer[i++] = (char) nextInt(r, 0xd800, 0xdbff); @@ -218,4 +237,22 @@ public static int getRandomMultiplier() { return Integer.parseInt(System.getProperty("random.multiplier", "1")); } + + public static CodecProvider alwaysCodec(final Codec c) { + return new CodecProvider() { + @Override + public Codec getWriter(SegmentWriteState state) { + return c; + } + + @Override + public Codec lookup(String name) { + return c; + } + }; + } + + public static CodecProvider alwaysCodec(final String codec) { + return alwaysCodec(CodecProvider.getDefault().lookup(codec)); + } } Index: lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java =================================================================== --- lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java (revision 967083) +++ lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java (working copy) @@ -792,6 +792,7 @@ throws IOException { String field = StringHelper.intern(entryKey.field); + Terms terms = MultiFields.getTerms(reader, field); final boolean fasterButMoreRAM = ((Boolean) entryKey.custom).booleanValue(); Index: lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java (revision 967083) +++ lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java (working copy) @@ -116,7 +116,7 @@ // different TermComps final Comparator<BytesRef> subTermComp = termsEnumIndex.termsEnum.getComparator(); if (subTermComp != null && !subTermComp.equals(termComp)) { - throw new IllegalStateException("sub-readers have different BytesRef.Comparators; cannot merge"); + throw new IllegalStateException("sub-readers have different BytesRef.Comparators: " + subTermComp + " vs " + termComp + "; cannot merge"); } } Index: lucene/src/java/org/apache/lucene/index/DocumentsWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (revision 967083) +++ lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (working copy) @@ -1095,7 +1095,7 @@ continue; } assert checkDeleteTerm(term); - + 
if (termsEnum.seek(term.bytes(), false) == TermsEnum.SeekStatus.FOUND) { DocsEnum docsEnum = termsEnum.docs(reader.getDeletedDocs(), docs); Index: lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java (revision 967083) +++ lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java (working copy) @@ -22,6 +22,7 @@ import org.apache.lucene.index.FieldsEnum; import java.io.IOException; +import java.io.Closeable; /** Abstract API that consumes terms, doc, freq, prox and * payloads postings. Concrete implementations of this @@ -30,7 +31,7 @@ * * @lucene.experimental */ -public abstract class FieldsConsumer { +public abstract class FieldsConsumer implements Closeable { /** Add a new field */ public abstract TermsConsumer addField(FieldInfo field) throws IOException; Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java (revision 967083) +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java (working copy) @@ -132,18 +132,21 @@ position = p; termBuffer.set(t); prevBuffer.reset(); + //System.out.println(" ste doSeek prev=" + prevBuffer.toTerm() + " this=" + this); termInfo.set(ti); } /** Increments the enumeration to the next element. True if one exists.*/ public final boolean next() throws IOException { + prevBuffer.set(termBuffer); + //System.out.println(" ste setPrev=" + prev() + " this=" + this); + if (position++ >= size - 1) { - prevBuffer.set(termBuffer); termBuffer.reset(); + //System.out.println(" EOF"); return false; } - prevBuffer.set(termBuffer); termBuffer.read(input, fieldInfos); newSuffixStart = termBuffer.newSuffixStart; @@ -168,6 +171,7 @@ if (isIndex) indexPointer += input.readVLong(); // read index pointer + //System.out.println(" ste ret term=" + term()); return true; } Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java (revision 967083) +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java (working copy) @@ -18,9 +18,10 @@ */ import java.io.IOException; +import java.util.Comparator; + import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.index.Term; import org.apache.lucene.index.FieldInfos; @@ -28,102 +29,65 @@ private String field; private Term term; // cached - private boolean dirty; // true if text was set externally (ie not read via UTF8 bytes) - private UnicodeUtil.UTF16Result text = new UnicodeUtil.UTF16Result(); private BytesRef bytes = new BytesRef(10); - int newSuffixStart; + private static final Comparator<BytesRef> utf8AsUTF16Comparator = BytesRef.getUTF8SortedAsUTF16Comparator(); - public final int compareTo(TermBuffer other) { + int newSuffixStart; // only valid right after .read is called + + public int compareTo(TermBuffer other) { if (field == other.field) // fields are interned - return compareChars(text.result, text.length, other.text.result, other.text.length); + return utf8AsUTF16Comparator.compare(bytes, other.bytes); else return field.compareTo(other.field); } - private static int compareChars(char[] chars1, int len1, - char[]
chars2, int len2) { - final int end = len1 < len2 ? len1:len2; - for (int k = 0; k < end; k++) { - char c1 = chars1[k]; - char c2 = chars2[k]; - if (c1 != c2) { - return c1 - c2; - } - } - return len1 - len2; - } - - public final void read(IndexInput input, FieldInfos fieldInfos) + public void read(IndexInput input, FieldInfos fieldInfos) throws IOException { this.term = null; // invalidate cache - int start = input.readVInt(); + newSuffixStart = input.readVInt(); int length = input.readVInt(); - int totalLength = start + length; + int totalLength = newSuffixStart + length; if (bytes.bytes.length < totalLength) { bytes.grow(totalLength); } - if (dirty) { - // Fully convert all bytes since bytes is dirty - UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes); - bytes.length = totalLength; - input.readBytes(bytes.bytes, start, length); - UnicodeUtil.UTF8toUTF16(bytes.bytes, 0, totalLength, text); - dirty = false; - } else { - // Incrementally convert only the UTF8 bytes that are new: - bytes.length = totalLength; - input.readBytes(bytes.bytes, start, length); - UnicodeUtil.UTF8toUTF16(bytes.bytes, start, length, text); - } - - while(true) { - newSuffixStart = text.offsets[start]; - if (newSuffixStart != -1) { - break; - } - if (--start == 0) { - newSuffixStart = 0; - break; - } - } + bytes.length = totalLength; + input.readBytes(bytes.bytes, newSuffixStart, length); this.field = fieldInfos.fieldName(input.readVInt()); } - public final void set(Term term) { + public void set(Term term) { if (term == null) { reset(); return; } - - final BytesRef termBytes = term.bytes(); - UnicodeUtil.UTF8toUTF16(termBytes.bytes, termBytes.offset, termBytes.length, text); - dirty = true; + bytes.copy(term.bytes()); field = term.field(); this.term = term; } - public final void set(TermBuffer other) { - text.copyText(other.text); - dirty = true; + public void set(TermBuffer other) { field = other.field; - term = other.term; + // nocommit -- right? 
+ //term = other.term; + term = null; + bytes.copy(other.bytes); } public void reset() { field = null; - text.setLength(0); term = null; - dirty = true; } public Term toTerm() { if (field == null) // unset return null; - if (term == null) - term = new Term(field, new BytesRef(text.result, 0, text.length), false); + if (term == null) { + term = new Term(field, new BytesRef(bytes), false); + //term = new Term(field, bytes, false); + } return term; } @@ -134,12 +98,7 @@ try { clone = (TermBuffer)super.clone(); } catch (CloneNotSupportedException e) {} - clone.dirty = true; - clone.bytes = new BytesRef(10); - clone.text = new UnicodeUtil.UTF16Result(); - clone.text.offsets = new int[text.offsets.length]; - System.arraycopy(text.offsets, 0, clone.text.offsets, 0, text.offsets.length); - clone.text.copyText(text); + clone.bytes = new BytesRef(bytes); return clone; } } Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java (revision 967083) +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java (working copy) @@ -40,16 +40,16 @@ public class PreFlexCodec extends Codec { /** Extension of terms file */ - static final String TERMS_EXTENSION = "tis"; + public static final String TERMS_EXTENSION = "tis"; /** Extension of terms index file */ - static final String TERMS_INDEX_EXTENSION = "tii"; + public static final String TERMS_INDEX_EXTENSION = "tii"; /** Extension of freq postings file */ - static final String FREQ_EXTENSION = "frq"; + public static final String FREQ_EXTENSION = "frq"; /** Extension of prox postings file */ - static final String PROX_EXTENSION = "prx"; + public static final String PROX_EXTENSION = "prx"; public PreFlexCodec() { name = "PreFlex"; @@ -62,7 +62,7 @@ @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - return new PreFlexFields(state.dir, state.fieldInfos, state.segmentInfo, state.readBufferSize, state.termsIndexDivisor); + return new PreFlexFields(state.dir, state.fieldInfos, state.segmentInfo, state.readBufferSize, state.termsIndexDivisor, true); } @Override Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfo.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfo.java (revision 967083) +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfo.java (working copy) @@ -23,30 +23,30 @@ * indexing. */ @Deprecated -class TermInfo { +public class TermInfo { /** The number of documents which contain the term. 
*/ - int docFreq = 0; + public int docFreq = 0; - long freqPointer = 0; - long proxPointer = 0; - int skipOffset; + public long freqPointer = 0; + public long proxPointer = 0; + public int skipOffset; - TermInfo() {} + public TermInfo() {} - TermInfo(int df, long fp, long pp) { + public TermInfo(int df, long fp, long pp) { docFreq = df; freqPointer = fp; proxPointer = pp; } - TermInfo(TermInfo ti) { + public TermInfo(TermInfo ti) { docFreq = ti.docFreq; freqPointer = ti.freqPointer; proxPointer = ti.proxPointer; skipOffset = ti.skipOffset; } - final void set(int docFreq, + public final void set(int docFreq, long freqPointer, long proxPointer, int skipOffset) { this.docFreq = docFreq; this.freqPointer = freqPointer; @@ -54,7 +54,7 @@ this.skipOffset = skipOffset; } - final void set(TermInfo ti) { + public final void set(TermInfo ti) { docFreq = ti.docFreq; freqPointer = ti.freqPointer; proxPointer = ti.proxPointer; Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java (revision 967083) +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java (working copy) @@ -119,9 +119,12 @@ indexTerms = new Term[indexSize]; indexInfos = new TermInfo[indexSize]; indexPointers = new long[indexSize]; - - for (int i = 0; indexEnum.next(); i++) { + + for (int i=0;indexEnum.next(); i++) { indexTerms[i] = indexEnum.term(); + assert indexTerms[i] != null; + assert indexTerms[i].text() != null; + assert indexTerms[i].field() != null; indexInfos[i] = indexEnum.termInfo(); indexPointers[i] = indexEnum.indexPointer; @@ -160,14 +163,14 @@ return origEnum.maxSkipLevels; } - final void close() throws IOException { + void close() throws IOException { if (origEnum != null) origEnum.close(); threadResources.close(); } /** Returns the number of term/value pairs in the set. 
*/ - final long size() { + long size() { return size; } @@ -183,12 +186,13 @@ /** Returns the offset of the greatest index entry which is less than or equal to term.*/ - private final int getIndexOffset(Term term) { + private int getIndexOffset(Term term) { int lo = 0; // binary search indexTerms[] int hi = indexTerms.length - 1; while (hi >= lo) { int mid = (lo + hi) >>> 1; + assert indexTerms[mid] != null : "indexTerms = " + indexTerms.length + " mid=" + mid; int delta = term.compareToUTF16(indexTerms[mid]); if (delta < 0) hi = mid - 1; @@ -200,7 +204,7 @@ return hi; } - private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException { + private void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException { enumerator.seek(indexPointers[indexOffset], ((long) indexOffset * totalIndexInterval) - 1, indexTerms[indexOffset], indexInfos[indexOffset]); @@ -231,6 +235,9 @@ } TermInfo seekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd) throws IOException { + if (size == 0) { + return null; + } // optimize sequential access: first try scanning cached enum w/o seeking if (enumerator.term() != null // term is at or past current @@ -242,7 +249,6 @@ // no need to seek final TermInfo ti; - int numScans = enumerator.scanTo(term); if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) { ti = enumerator.termInfo(); @@ -279,6 +285,7 @@ seekEnum(enumerator, indexPos); enumerator.scanTo(term); final TermInfo ti; + if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) { ti = enumerator.termInfo(); if (tiOrd == null) { @@ -294,7 +301,7 @@ } // called only from asserts - private final boolean sameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator) { + private boolean sameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator) { if (ti1.docFreq != ti2.docFreq) { return false; } @@ -319,7 +326,7 @@ } /** Returns the position of a Term in the set or -1. */ - final long getPosition(Term term) throws IOException { + long getPosition(Term term) throws IOException { if (size == 0) return -1; ensureIndexIsRead(); Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (revision 967083) +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (working copy) @@ -40,12 +40,11 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.UnicodeUtil; -import org.apache.lucene.util.ArrayUtil; /** Exposes flex API on a pre-flex index, as a codec. * @lucene.experimental */ public class PreFlexFields extends FieldsProducer { - + private static final boolean DEBUG_SURROGATES = false; public TermInfosReader tis; @@ -59,11 +58,19 @@ private final Directory dir; private final int readBufferSize; private Directory cfsReader; + private final boolean unicodeSortOrder; - PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor) + // If unicodeSortOrder is true, we do the surrogates dance + // so that the terms are sorted by unicode sort order. + // This should be true when segments are used for "normal" + // searching; it's only false during testing, to create a + // pre-flex index, using the preflexrw codec under + // src/test. 
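The getIndexOffset() above is the classic "greatest entry less than or equal to the target" binary search over the parallel indexTerms/indexInfos/indexPointers arrays: on a miss, hi has just crossed below lo and points at the last entry that compares <= term (or -1 when the term sorts before the whole index). A standalone sketch of the same invariant, with String.compareTo standing in for Term.compareToUTF16 and illustrative names throughout:

// Returns the offset of the greatest index entry <= target, or -1 if target
// precedes the entire index.
public class IndexOffsetDemo {
  static int getIndexOffset(String[] indexTerms, String target) {
    int lo = 0;
    int hi = indexTerms.length - 1;
    while (hi >= lo) {
      int mid = (lo + hi) >>> 1;          // unsigned shift avoids int overflow
      int delta = target.compareTo(indexTerms[mid]);
      if (delta < 0) {
        hi = mid - 1;                     // target sorts before mid
      } else if (delta > 0) {
        lo = mid + 1;                     // target sorts after mid
      } else {
        return mid;                       // exact match
      }
    }
    return hi;                            // last entry <= target, or -1
  }

  public static void main(String[] args) {
    String[] index = { "apple", "fig", "pear" };
    System.out.println(getIndexOffset(index, "fig"));      // 1 (exact)
    System.out.println(getIndexOffset(index, "grape"));    // 1 (fig <= grape < pear)
    System.out.println(getIndexOffset(index, "aardvark")); // -1 (before all entries)
  }
}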
+ public PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor, boolean unicodeSortOrder) throws IOException { si = info; + this.unicodeSortOrder = unicodeSortOrder; // NOTE: we must always load terms index, even for // "sequential" scan during merging, because what is @@ -182,6 +189,12 @@ if (cfsReader != null) { cfsReader.close(); } + if (freqStream != null) { + freqStream.close(); + } + if (proxStream != null) { + proxStream.close(); + } } private class PreFlexFieldsEnum extends FieldsEnum { @@ -228,7 +241,11 @@ public Comparator getComparator() { // Pre-flex indexes always sorted in UTF16 order, but // we remap on-the-fly to unicode order - return BytesRef.getUTF8SortedAsUnicodeComparator(); + if (unicodeSortOrder) { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } else { + return BytesRef.getUTF8SortedAsUTF16Comparator(); + } } } @@ -238,119 +255,227 @@ private boolean skipNext; private BytesRef current; - private int[] surrogateSeekPending = new int[1]; - private boolean[] surrogateDidSeekBack = new boolean[1]; - private int surrogateSeekUpto; - private char[] pendingPrefix; - private SegmentTermEnum seekTermEnum; private Term protoTerm; + + private static final byte UTF8_NON_BMP_LEAD = (byte) 0xf0; + private static final byte UTF8_HIGH_BMP_LEAD = (byte) 0xee; + + // Returns true if the unicode char is "after" the + // surrogates in UTF16, ie >= U+E000 and <= U+FFFF: + private final boolean isHighBMPChar(byte[] b, int idx) { + return (b[idx] & UTF8_HIGH_BMP_LEAD) == UTF8_HIGH_BMP_LEAD; + } + + // Returns true if the unicode char in the UTF8 byte + // sequence starting at idx encodes a char outside of + // BMP (ie what would be a surrogate pair in UTF16): + private final boolean isNonBMPChar(byte[] b, int idx) { + return (b[idx] & UTF8_NON_BMP_LEAD) == UTF8_NON_BMP_LEAD; + } + + private final byte[] scratch = new byte[4]; + private final BytesRef prevTerm = new BytesRef(); + private final BytesRef scratchTerm = new BytesRef(); private int newSuffixStart; - void reset(FieldInfo fieldInfo) throws IOException { - this.fieldInfo = fieldInfo; - protoTerm = new Term(fieldInfo.name); - if (termEnum == null) { - termEnum = getTermsDict().terms(protoTerm); - seekTermEnum = getTermsDict().terms(protoTerm); + // Swap in S, in place of E: + private boolean seekToNonBMP(SegmentTermEnum te, BytesRef term, int pos) throws IOException { + final int savLength = term.length; + + assert term.offset == 0; + + // The 3 bytes starting at downTo make up 1 + // unicode character: + assert isHighBMPChar(term.bytes, pos); + + // nocommit -- why does this trip? 
+ // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3); + + // Save the bytes && length, since we need to + // restore this if seek "back" finds no matching + // terms + if (term.bytes.length < 4+pos) { + term.grow(4+pos); + } + + scratch[0] = term.bytes[pos]; + scratch[1] = term.bytes[pos+1]; + scratch[2] = term.bytes[pos+2]; + + term.bytes[pos] = (byte) 0xf0; + term.bytes[pos+1] = (byte) 0x90; + term.bytes[pos+2] = (byte) 0x80; + term.bytes[pos+3] = (byte) 0x80; + term.length = 4+pos; + + if (DEBUG_SURROGATES) { + System.out.println(" try seek term=" + UnicodeUtil.toHexString(term.utf8ToString())); + } + + // Seek "back": + getTermsDict().seekEnum(te, protoTerm.createTerm(term)); + + // Test if the term we seek'd to in fact found a + // surrogate pair at the same position as the E: + Term t2 = te.term(); + + // Cannot be null (or move to next field) because at + // "worst" it'd seek to the same term we are on now, + // unless we are being called from seek + if (t2 == null || t2.field() != fieldInfo.name) { + return false; + } + + if (DEBUG_SURROGATES) { + System.out.println(" got term=" + UnicodeUtil.toHexString(t2.text())); + } + + // Now test if prefix is identical and we found + // a non-BMP char at the same position: + BytesRef b2 = t2.bytes(); + assert b2.offset == 0; + + boolean matches; + if (b2.length >= term.length && isNonBMPChar(b2.bytes, pos)) { + matches = true; + for(int i=0;i 0) { - sb.append(' '); + boolean didSeek = false; + + final int limit = Math.min(newSuffixStart, scratchTerm.length-1); + + while(downTo > limit) { + + if (isHighBMPChar(prevTerm.bytes, downTo)) { + + if (DEBUG_SURROGATES) { + System.out.println(" found E pos=" + downTo + " vs len=" + prevTerm.length); } - sb.append(surrogateSeekPending[i]); + + if (seekToNonBMP(seekTermEnum, prevTerm, downTo)) { + // TODO: more efficient seek? + getTermsDict().seekEnum(termEnum, seekTermEnum.term()); + //newSuffixStart = downTo+4; + newSuffixStart = downTo; + scratchTerm.copy(termEnum.term().bytes()); + didSeek = true; + if (DEBUG_SURROGATES) { + System.out.println(" seek!"); + } + break; + } else { + if (DEBUG_SURROGATES) { + System.out.println(" no seek"); + } + } } - sb.append(" pendingSeekText=" + new String(pendingPrefix, 0, surrogateSeekPending[surrogateSeekUpto-1])); - return sb.toString(); + + // Shorten prevTerm in place so that we don't redo + // this loop if we come back here: + if ((prevTerm.bytes[downTo] & 0xc0) == 0xc0 || (prevTerm.bytes[downTo] & 0x80) == 0) { + prevTerm.length = downTo; + } + + downTo--; } + + return didSeek; } - private boolean popPendingSeek() throws IOException { + // Look for seek type 3 ("pop"): if the delta from + // prev -> current was replacing an S with an E, + // we must now seek to beyond that E. This seek + // "finishes" the dance at this character + // position. + private boolean doPop() throws IOException { + if (DEBUG_SURROGATES) { - System.out.println(" check pop newSuffix=" + newSuffixStart + " stack=" + getStack()); + System.out.println(" try pop"); } - // if a .next() has advanced beyond the - // after-surrogates range we had last seeked to, we - // must seek back to the start and resume .next from - // there. this pops the pending seek off the stack. 
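The seekToNonBMP() method and the lead-byte helpers above rest on a handful of UTF-8 facts: chars in U+E000..U+FFFF (category E) encode as three bytes with lead byte 0xEE or 0xEF; non-BMP chars (category S) encode as four bytes with lead byte 0xF0..0xF4; F0 90 80 80 is exactly U+10000, the smallest possible S char, hence the seek-back target written above; and EF BF BF is U+FFFF, the largest BMP char, which doPop below uses as its seek-past target. A small self-check of those facts (class and method names are mine, not the patch's):

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

public class SurrogateDanceBytesDemo {
  // Same masks as the patch's isHighBMPChar/isNonBMPChar:
  static boolean isHighBMPLead(byte b) { return (b & 0xee) == 0xee; }
  static boolean isNonBMPLead(byte b)  { return (b & 0xf0) == 0xf0; }

  public static void main(String[] args) {
    byte[] e000 = "\ue000".getBytes(StandardCharsets.UTF_8);
    byte[] ffff = "\uffff".getBytes(StandardCharsets.UTF_8);
    byte[] nonBmp = new String(Character.toChars(0x10000)).getBytes(StandardCharsets.UTF_8);

    System.out.println(Arrays.toString(e000));    // [-18, -128, -128]       == EE 80 80
    System.out.println(Arrays.toString(ffff));    // [-17, -65, -65]         == EF BF BF
    System.out.println(Arrays.toString(nonBmp));  // [-16, -112, -128, -128] == F0 90 80 80

    System.out.println(isHighBMPLead(e000[0]) && isHighBMPLead(ffff[0]));  // true
    System.out.println(isNonBMPLead(nonBmp[0]));                           // true
    System.out.println(isHighBMPLead(nonBmp[0]) || isNonBMPLead(e000[0])); // false

    // F0 90 80 80 decodes to U+10000, the smallest non-BMP codepoint:
    System.out.println(Integer.toHexString(
        new String(nonBmp, StandardCharsets.UTF_8).codePointAt(0)));       // 10000
  }
}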
- final Term t = termEnum.term(); - if (surrogateSeekUpto > 0) { - final int seekPrefix = surrogateSeekPending[surrogateSeekUpto-1]; + + assert newSuffixStart <= prevTerm.length; + assert newSuffixStart < scratchTerm.length || newSuffixStart == 0; + + if (prevTerm.length > newSuffixStart && + isNonBMPChar(prevTerm.bytes, newSuffixStart) && + isHighBMPChar(scratchTerm.bytes, newSuffixStart)) { + + // Seek type 2 -- put U+FFFF at this position: + // nocommit -- can we somehow use 0xff??? + scratchTerm.bytes[newSuffixStart] = (byte) 0xff; + //scratchTerm.bytes[newSuffixStart] = (byte) 0xef; + scratchTerm.bytes[newSuffixStart+1] = (byte) 0xbf; + scratchTerm.bytes[newSuffixStart+2] = (byte) 0xbf; + scratchTerm.length = newSuffixStart+3; + if (DEBUG_SURROGATES) { - System.out.println(" seekPrefix=" + seekPrefix); + System.out.println(" seek to term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString()); } - if (newSuffixStart < seekPrefix) { - assert pendingPrefix != null; - assert pendingPrefix.length > seekPrefix; - pendingPrefix[seekPrefix] = UnicodeUtil.UNI_SUR_HIGH_START; - pendingPrefix[1+seekPrefix] = UnicodeUtil.UNI_SUR_LOW_START; - Term t2 = protoTerm.createTerm(new BytesRef(pendingPrefix, 0, 2+seekPrefix)); + + // TODO: more efficient seek? can we simply swap + // the enums? + getTermsDict().seekEnum(termEnum, protoTerm.createTerm(scratchTerm)); + + final Term t2 = termEnum.term(); + + // We could hit EOF or different field since this + // was a seek "forward": + if (t2 != null && t2.field() == fieldInfo.name) { + if (DEBUG_SURROGATES) { - System.out.println(" do pop; seek back to " + UnicodeUtil.toHexString(t2.text())); + System.out.println(" got term=" + UnicodeUtil.toHexString(t2.text()) + " " + t2.bytes()); } - getTermsDict().seekEnum(termEnum, t2); - surrogateDidSeekBack[surrogateSeekUpto-1] = true; - // +2 because we don't want to re-check the - // surrogates we just seek'd back to - newSuffixStart = seekPrefix + 2; + final BytesRef b2 = t2.bytes(); + assert b2.offset == 0; + + + // Set newSuffixStart -- we can't use + // termEnum's since the above seek may have + // done no scanning (eg, term was precisely + // an index term, or, was in the term seek + // cache): + scratchTerm.copy(b2); + setNewSuffixStart(prevTerm, scratchTerm); + return true; - } else if (newSuffixStart == seekPrefix && surrogateDidSeekBack[surrogateSeekUpto-1] && t != null && t.field() == fieldInfo.name && t.text().charAt(seekPrefix) > UnicodeUtil.UNI_SUR_LOW_END) { - assert pendingPrefix != null; - assert pendingPrefix.length > seekPrefix; - pendingPrefix[seekPrefix] = 0xffff; - Term t2 = protoTerm.createTerm(new BytesRef(pendingPrefix, 0, 1+seekPrefix)); + } else if (newSuffixStart != 0 || scratchTerm.length != 0) { if (DEBUG_SURROGATES) { - System.out.println(" finish pop; seek fwd to " + UnicodeUtil.toHexString(t2.text())); + System.out.println(" got term=null (or next field)"); } - getTermsDict().seekEnum(termEnum, t2); - if (DEBUG_SURROGATES) { - System.out.println(" found term=" + (termEnum.term() == null ?
null : UnicodeUtil.toHexString(termEnum.term().text()))); } surrogateSeekUpto--; - - if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) { - // force pop - newSuffixStart = -1; - } else { - newSuffixStart = termEnum.newSuffixStart; - } - + newSuffixStart = 0; + scratchTerm.length = 0; return true; } } @@ -358,117 +483,245 @@ return false; } - private UnicodeUtil.UTF16Result termBuffer = new UnicodeUtil.UTF16Result(); - private UnicodeUtil.UTF16Result seekBuffer = new UnicodeUtil.UTF16Result(); + // Pre-flex indices store terms in UTF16 sort order, but + // certain queries require Unicode codepoint order; this + // method carefully seeks around surrogates to handle + // this impedance mismatch + + private void surrogateDance() throws IOException { + + if (!unicodeSortOrder) { + return; + } + + // We are invoked after TIS.next() (by UTF16 order) to + // possibly seek to a different "next" (by unicode + // order) term. + + // We scan only the "delta" from the last term to the + // current term, in UTF8 bytes. We look at 1) the bytes + // stripped from the prior term, and then 2) the bytes + // appended to that prior term's prefix. - private boolean pushNewSurrogate() throws IOException { + // We don't care about specific UTF8 sequences, just + // the "category" of the UTF16 character. Category S + // is a high/low surrogate pair (ie non-BMP). + // Category E is any BMP char > UNI_SUR_LOW_END (and < + // U+FFFF). Category A is the rest (any unicode char + // <= UNI_SUR_HIGH_START). + + // The core issue is that pre-flex indices sort the + // characters as ASE, while flex must sort as AES. So + // when scanning, when we hit S, we must 1) seek + // forward to E and enum the terms there, then 2) seek + // back to S and enum all terms there, then 3) seek to + // after E. Three different seek points (1, 2, 3). + + // We can easily detect S in UTF8: if a byte has + // prefix 11110 (0xf0), then that byte and the + // following 3 bytes encode a single unicode codepoint + // in S. Similarly, we can detect E: if a byte has + // prefix 1110111 (0xee), then that byte and the + // following 2 bytes encode a single unicode codepoint + // in E. + + // Note that this is really a recursive process -- + // maybe the char at pos 2 needs to dance, but at any + // point in its dance, suddenly pos 4 needs to dance, + // so you must finish pos 4 before returning to pos + // 2. But then during pos 4's dance maybe pos 7 needs + // to dance, etc. However, despite being recursive, + // we don't need to hold any state because the state + // can always be derived by looking at prior term & + // current term. + + // TODO: can we avoid this copy?
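The ASE-vs-AES claim in the comment above is easy to verify with three one-char terms, since Java's String.compareTo is exactly UTF16 code unit order while codePointAt gives the Unicode order the flex API wants; the copy the TODO refers to follows just below. Class and variable names here are illustrative:

public class SortOrderDemo {
  public static void main(String[] args) {
    String a = "a";                                     // category A: U+0061
    String s = new String(Character.toChars(0x10000));  // category S: non-BMP
    String e = "\ue000";                                // category E: high BMP

    // UTF16 code unit order (pre-flex / String.compareTo): A < S < E,
    // because S is stored as surrogates 0xD800 0xDC00, and 0xD800 < 0xE000:
    System.out.println(a.compareTo(s) < 0 && s.compareTo(e) < 0);  // true

    // Unicode codepoint order (flex): A < E < S:
    System.out.println(a.codePointAt(0) < e.codePointAt(0)
        && e.codePointAt(0) < s.codePointAt(0));                   // true
  }
}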
+ if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) { + scratchTerm.length = 0; + } else { + scratchTerm.copy(termEnum.term().bytes()); + } + if (DEBUG_SURROGATES) { - System.out.println(" check push newSuffix=" + newSuffixStart + " stack=" + getStack()); + System.out.println(" dance"); + System.out.println(" prev=" + UnicodeUtil.toHexString(prevTerm.utf8ToString())); + System.out.println(" " + prevTerm.toString()); + System.out.println(" term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString())); + System.out.println(" " + scratchTerm.toString()); } - final Term t = termEnum.term(); - if (t == null || t.field() != fieldInfo.name) { - return false; + + // This code assumes TermInfosReader/SegmentTermEnum + // always use BytesRef.offset == 0 + assert prevTerm.offset == 0; + assert scratchTerm.offset == 0; + + // Need to loop here because we may need to do multiple + // pops, and possibly a continue in the end, ie: + // + // cont + // pop, cont + // pop, pop, cont + // + // + + while(true) { + if (doContinue()) { + break; + } else { + if (!doPop()) { + break; + } + } } - final BytesRef bytes = t.bytes(); - UnicodeUtil.UTF8toUTF16(bytes.bytes, bytes.offset, bytes.length, termBuffer); + if (DEBUG_SURROGATES) { + System.out.println(" finish bmp ends"); + } - for(int i=Math.max(0,newSuffixStart);i= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END && (surrogateSeekUpto == 0 || i > surrogateSeekPending[surrogateSeekUpto-1])) { + doPushes(); + } + + // Look for seek type 1 ("push"): if the newly added + // suffix contains any S, we must try to seek to the + // corresponding E. If we find a match, we go there; + // else we keep looking for additional S's in the new + // suffix. This "starts" the dance, at this character + // position: + private void doPushes() throws IOException { + + int upTo = newSuffixStart; + if (DEBUG_SURROGATES) { + System.out.println(" try push newSuffixStart=" + newSuffixStart + " scratchLen=" + scratchTerm.length); + } + + while(upTo < scratchTerm.length) { + if (isNonBMPChar(scratchTerm.bytes, upTo) && + (upTo > newSuffixStart || + (upTo >= prevTerm.length || + (!isNonBMPChar(prevTerm.bytes, upTo) && + !isHighBMPChar(prevTerm.bytes, upTo))))) { // nocommit -- we can't cmp to prevTerm if we'd done a seek 3 or seek 2 before? + + // A non-BMP char (4 bytes UTF8) starts here: + assert scratchTerm.length >= upTo + 4; + + final int savLength = scratchTerm.length; + scratch[0] = scratchTerm.bytes[upTo]; + scratch[1] = scratchTerm.bytes[upTo+1]; + scratch[2] = scratchTerm.bytes[upTo+2]; + + scratchTerm.bytes[upTo] = UTF8_HIGH_BMP_LEAD; + scratchTerm.bytes[upTo+1] = (byte) 0x80; + scratchTerm.bytes[upTo+2] = (byte) 0x80; + scratchTerm.length = upTo+3; + if (DEBUG_SURROGATES) { - System.out.println(" found high surr 0x" + Integer.toHexString(ch) + " at pos=" + i); + System.out.println(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString() + " len=" + scratchTerm.length); } - // the next() that we just did read in a new - // suffix, containing a surrogate pair + // Seek "forward": + // TODO: more efficient seek? 
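The doPushes() surgery above is "seek type 1" from the dance comment: at the byte position of an S char it temporarily writes EE 80 80, the UTF-8 encoding of U+E000 (the smallest category-E char), so the forward seek that resumes just below lands on the first term carrying an E at that position, if any exists. A standalone rendition of just the byte surgery, with made-up term content and names:

import java.nio.charset.StandardCharsets;

public class PushTargetDemo {
  public static void main(String[] args) {
    // Term "ab" + a non-BMP char; the dance wants the first term that has a
    // category-E char at that same position instead:
    byte[] term = ("ab" + new String(Character.toChars(0x10400)))
        .getBytes(StandardCharsets.UTF_8);
    int upTo = 2; // byte position of the S char's lead byte (0xF0)

    // Overwrite the 4-byte S with EE 80 80 (U+E000) and shorten the term:
    byte[] target = new byte[upTo + 3];
    System.arraycopy(term, 0, target, 0, upTo);
    target[upTo]     = (byte) 0xee;  // the patch's UTF8_HIGH_BMP_LEAD
    target[upTo + 1] = (byte) 0x80;
    target[upTo + 2] = (byte) 0x80;

    // Seeking to this target positions the enum at the first term >= "ab\uE000":
    System.out.println(new String(target, StandardCharsets.UTF_8).equals("ab\ue000")); // true
  }
}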
+ getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(scratchTerm)); - // seek forward to see if there are any terms with - // this same prefix, but with characters after the - // surrogate range; if so, we must first iterate - // them, then seek back to the surrogates + scratchTerm.bytes[upTo] = scratch[0]; + scratchTerm.bytes[upTo+1] = scratch[1]; + scratchTerm.bytes[upTo+2] = scratch[2]; + scratchTerm.length = savLength; - char[] testPrefix = new char[i+2]; - for(int j=0;j= upTo+3 && isHighBMPChar(b2.bytes, upTo)) { + matches = true; + for(int i=0;i BMP + upTo += 3; + + // NOTE: we keep iterating, now, since this + // can easily "recurse". Ie, after seeking + // forward at a certain char position, we may + // find another surrogate in our [new] suffix + // and must then do another seek (recurse) } else { - // there are no terms after the surrogates, so - // we do nothing to the enum and just step - // through the surrogates like normal. but we - // must keep iterating through the term, in case - // another surrogate pair appears later + upTo++; } + } else { + upTo++; } } + } - return false; + void reset(FieldInfo fieldInfo) throws IOException { + //System.out.println("pff.reset te=" + termEnum); + this.fieldInfo = fieldInfo; + protoTerm = new Term(fieldInfo.name); + if (termEnum == null) { + termEnum = getTermsDict().terms(protoTerm); + seekTermEnum = getTermsDict().terms(protoTerm); + //System.out.println(" term=" + termEnum.term()); + } else { + getTermsDict().seekEnum(termEnum, protoTerm); + } + skipNext = true; + + final Term t = termEnum.term(); + if (t != null && t.field() == fieldInfo.name) { + newSuffixStart = 0; + prevTerm.length = 0; + surrogateDance(); + } } @Override public Comparator getComparator() { // Pre-flex indexes always sorted in UTF16 order, but // we remap on-the-fly to unicode order - return BytesRef.getUTF8SortedAsUnicodeComparator(); + if (unicodeSortOrder) { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } else { + return BytesRef.getUTF8SortedAsUTF16Comparator(); + } } @Override @@ -484,7 +737,7 @@ @Override public SeekStatus seek(BytesRef term, boolean useCache) throws IOException { if (DEBUG_SURROGATES) { - System.out.println("TE.seek() term=" + term.utf8ToString()); + System.out.println("TE.seek target=" + UnicodeUtil.toHexString(term.utf8ToString())); } skipNext = false; final TermInfosReader tis = getTermsDict(); @@ -492,50 +745,142 @@ assert termEnum != null; - if (termEnum == null) { - termEnum = tis.terms(t0); - } else { - tis.seekEnum(termEnum, t0); - } + tis.seekEnum(termEnum, t0); - surrogateSeekUpto = 0; - surrogatesDance(); - final Term t = termEnum.term(); - final BytesRef tr = t == null ? null : t.bytes(); - - if (t != null && t.field() == fieldInfo.name && term.bytesEquals(tr)) { - current = tr; + if (t != null && t.field() == fieldInfo.name && term.bytesEquals(t.bytes())) { + // If we found an exact match, no need to do the + // surrogate dance + if (DEBUG_SURROGATES) { + System.out.println(" seek exact match"); + } + current = t.bytes(); return SeekStatus.FOUND; } else if (t == null || t.field() != fieldInfo.name) { + + // nocommit -- why can't we handle this like the + // next() into null? set term as prevTerm then dance? 
+ + if (DEBUG_SURROGATES) { + System.out.println(" seek hit EOF"); + } + + // We hit EOF; try end-case surrogate dance: if we + // find an E, try swapping in S, backwards: + scratchTerm.copy(term); + + assert scratchTerm.offset == 0; + + for(int i=scratchTerm.length-1;i>=0;i--) { + if (isHighBMPChar(scratchTerm.bytes, i)) { + if (DEBUG_SURROGATES) { + System.out.println(" found E pos=" + i + "; try seek"); + } + + if (seekToNonBMP(seekTermEnum, scratchTerm, i)) { + + scratchTerm.copy(seekTermEnum.term().bytes()); + getTermsDict().seekEnum(termEnum, seekTermEnum.term()); + + newSuffixStart = 1+i; + + doPushes(); + + // Found a match + // TODO: faster seek? + current = termEnum.term().bytes(); + return SeekStatus.NOT_FOUND; + } + } + } + + if (DEBUG_SURROGATES) { + System.out.println(" seek END"); + } + current = null; return SeekStatus.END; } else { - current = tr; - return SeekStatus.NOT_FOUND; + + // We found a non-exact but non-null term; this one + // is fun -- just treat it like next, by pretending + // requested term was prev: + prevTerm.copy(term); + + if (DEBUG_SURROGATES) { + System.out.println(" seek hit non-exact term=" + UnicodeUtil.toHexString(t.text())); + } + + final BytesRef br = t.bytes(); + assert br.offset == 0; + + setNewSuffixStart(term, br); + + surrogateDance(); + + final Term t2 = termEnum.term(); + if (t2 == null || t2.field() != fieldInfo.name) { + assert t2 == null || !t2.field().equals(fieldInfo.name); // make sure fields are in fact interned + current = null; + return SeekStatus.END; + } else { + current = t2.bytes(); + assert !unicodeSortOrder || term.compareTo(current) < 0 : "term=" + UnicodeUtil.toHexString(term.utf8ToString()) + " vs current=" + UnicodeUtil.toHexString(current.utf8ToString()); + return SeekStatus.NOT_FOUND; + } } } + private void setNewSuffixStart(BytesRef br1, BytesRef br2) { + final int limit = Math.min(br1.length, br2.length); + int lastStart = 0; + for(int i=0;i= 0xee && bByte >= 0xee) { if ((aByte & 0xfe) == 0xee) { - aByte += 0x10; + // nocommit + aByte += 0xe; } if ((bByte&0xfe) == 0xee) { - bByte += 0x10; + // nocommit + bByte += 0xe; } } return aByte - bByte; @@ -346,10 +348,6 @@ // One is a prefix of the other, or, they are equal: return a.length - b.length; } - - public boolean equals(Object other) { - return this == other; - } } public void writeExternal(ObjectOutput out) Index: lucene/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java =================================================================== --- lucene/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java (revision 967083) +++ lucene/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java (working copy) @@ -33,6 +33,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.TopDocs; @@ -107,8 +108,8 @@ RAMDirectory ramdir = new RAMDirectory(); Analyzer analyzer = randomAnalyzer(); - IndexWriter writer = new IndexWriter(ramdir, analyzer, - IndexWriter.MaxFieldLength.UNLIMITED); + IndexWriter writer = new IndexWriter(ramdir, + new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); Document doc = new Document(); Field field1 = new Field("foo", fooField.toString(), Field.Store.NO, Field.Index.ANALYZED); 
Field field2 = new Field("term", termField.toString(), Field.Store.NO, Field.Index.ANALYZED);
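Stepping back to the UTF8-as-UTF16 comparator hunk above (the one changing the surrogate fix-up from aByte += 0x10 to aByte += 0xe under a nocommit): the property the bump needs is that the remapped high-BMP lead bytes 0xEE/0xEF land above 0xF4, the largest legal non-BMP lead byte, while keeping 0xEE below 0xEF, so that 3-byte chars (U+E000..U+FFFF) compare after 4-byte chars the way UTF16 surrogates require. Assuming that order preservation is indeed the only requirement (the nocommit suggests the author was still deciding), both constants pass this quick check:

// The comparator must make high-BMP chars (3-byte UTF-8, lead 0xEE/0xEF) sort
// *after* non-BMP chars (4-byte UTF-8, lead 0xF0..0xF4), because UTF16 stores
// non-BMP chars as surrogates 0xD800..0xDFFF, which are below 0xE000.
public class BumpConstantDemo {
  public static void main(String[] args) {
    for (int bump : new int[] { 0x10, 0xe }) {
      int ee = 0xee + bump;
      int ef = 0xef + bump;
      boolean ok = ee > 0xf4 && ef > 0xf4   // above every non-BMP lead byte
                && ee < ef                  // relative order of 0xEE/0xEF kept
                && ef <= 0xff;              // still fits an unsigned-byte compare
      System.out.println("bump=0x" + Integer.toHexString(bump) + " ok=" + ok); // true for both
    }
  }
}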