Index: lucene/src/test/org/apache/lucene/TestExternalCodecs.java
===================================================================
--- lucene/src/test/org/apache/lucene/TestExternalCodecs.java	(revision 1070898)
+++ lucene/src/test/org/apache/lucene/TestExternalCodecs.java	(working copy)
@@ -496,140 +496,14 @@
     }
   }
 
-  public static class MyCodecs extends CodecProvider {
-    MyCodecs() {
-      Codec ram = new RAMOnlyCodec();
-      register(ram);
-      setDefaultFieldCodec(ram.name);
-    }
-  }
-
-  // copied from PulsingCodec, just changing the terms
-  // comparator
-  private static class PulsingReverseTermsCodec extends Codec {
-
-    public PulsingReverseTermsCodec() {
-      name = "PulsingReverseTerms";
-    }
-
-    @Override
-    public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-      PostingsWriterBase docsWriter = new StandardPostingsWriter(state);
-
-      // Terms that have <= freqCutoff number of docs are
-      // "pulsed" (inlined):
-      final int freqCutoff = 1;
-      PostingsWriterBase pulsingWriter = new PulsingPostingsWriterImpl(freqCutoff, docsWriter);
-
-      // Terms dict index
-      TermsIndexWriterBase indexWriter;
-      boolean success = false;
-      try {
-        indexWriter = new FixedGapTermsIndexWriter(state) {
-          // We sort in reverse unicode order, so, we must
-          // disable the suffix-stripping opto that
-          // FixedGapTermsIndexWriter does by default!
-          @Override
-          protected int indexedTermPrefixLength(BytesRef priorTerm, BytesRef indexedTerm) {
-            return indexedTerm.length;
-          }
-        };
-        success = true;
-      } finally {
-        if (!success) {
-          pulsingWriter.close();
-        }
-      }
-
-      // Terms dict
-      success = false;
-      try {
-        FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, pulsingWriter, reverseUnicodeComparator);
-        success = true;
-        return ret;
-      } finally {
-        if (!success) {
-          try {
-            pulsingWriter.close();
-          } finally {
-            indexWriter.close();
-          }
-        }
-      }
-    }
-
-    @Override
-    public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
-
-      PostingsReaderBase docsReader = new StandardPostingsReader(state.dir, state.segmentInfo, state.readBufferSize, state.codecId);
-      PostingsReaderBase pulsingReader = new PulsingPostingsReaderImpl(docsReader);
-
-      // Terms dict index reader
-      TermsIndexReaderBase indexReader;
-
-      boolean success = false;
-      try {
-        indexReader = new FixedGapTermsIndexReader(state.dir,
-                                                   state.fieldInfos,
-                                                   state.segmentInfo.name,
-                                                   state.termsIndexDivisor,
-                                                   reverseUnicodeComparator,
-                                                   state.codecId);
-        success = true;
-      } finally {
-        if (!success) {
-          pulsingReader.close();
-        }
-      }
-
-      // Terms dict reader
-      success = false;
-      try {
-        FieldsProducer ret = new BlockTermsReader(indexReader,
-                                                  state.dir,
-                                                  state.fieldInfos,
-                                                  state.segmentInfo.name,
-                                                  pulsingReader,
-                                                  state.readBufferSize,
-                                                  reverseUnicodeComparator,
-                                                  StandardCodec.TERMS_CACHE_SIZE,
-                                                  state.codecId);
-        success = true;
-        return ret;
-      } finally {
-        if (!success) {
-          try {
-            pulsingReader.close();
-          } finally {
-            indexReader.close();
-          }
-        }
-      }
-    }
-
-    @Override
-    public void files(Directory dir, SegmentInfo segmentInfo, String codecId, Set<String> files) throws IOException {
-      StandardPostingsReader.files(dir, segmentInfo, codecId, files);
-      BlockTermsReader.files(dir, segmentInfo, codecId, files);
-      FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files);
-    }
-
-    @Override
-    public void getExtensions(Set<String> extensions) {
-      StandardCodec.getStandardExtensions(extensions);
-    }
-  }
-
-
   // tests storing "id" and "field2" fields as pulsing codec,
   // whose term sort is backwards unicode code point, and
"field1" as a custom entirely-in-RAM codec public void testPerFieldCodec() throws Exception { - CodecProvider provider = new MyCodecs(); - Codec pulsing = new PulsingReverseTermsCodec(); - provider.register(pulsing); + CodecProvider provider = new CoreCodecProvider(); + provider.register(new RAMOnlyCodec()); + provider.setDefaultFieldCodec("RamOnly"); - final int NUM_DOCS = 173; MockDirectoryWrapper dir = newDirectory(); dir.setCheckIndexOnClose(false); // we use a custom codec provider @@ -645,11 +519,11 @@ doc.add(newField("field1", "this field uses the standard codec as the test", Field.Store.NO, Field.Index.ANALYZED)); // uses pulsing codec: Field field2 = newField("field2", "this field uses the pulsing codec as the test", Field.Store.NO, Field.Index.ANALYZED); - provider.setFieldCodec(field2.name(), pulsing.name); + provider.setFieldCodec(field2.name(), "Pulsing"); doc.add(field2); Field idField = newField("id", "", Field.Store.NO, Field.Index.NOT_ANALYZED); - provider.setFieldCodec(idField.name(), pulsing.name); + provider.setFieldCodec(idField.name(), "Pulsing"); doc.add(idField); for(int i=0;i 0) { + // Target's prefix is before the common prefix + // of this block, so we position to start of + // block and return NOT_FOUND: + assert state.termCount == 0; + + final int suffix = termSuffixesReader.readVInt(); + term.length = termBlockPrefix + suffix; + if (term.bytes.length < term.length) { + term.grow(term.length); + } + termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix); + return SeekStatus.NOT_FOUND; + } else { + common++; } - //System.out.println(" FOUND"); - return SeekStatus.FOUND; - } else if (cmp > 0) { - //System.out.println(" NOT_FOUND term=" + term.utf8ToString()); - return SeekStatus.NOT_FOUND; + + continue; } - + + // Test every term in this block + while (true) { + state.termCount++; + state.ord++; + + final int suffix = termSuffixesReader.readVInt(); + + // We know the prefix matches, so just compare the new suffix: + final int termLen = termBlockPrefix + suffix; + int bytePos = termSuffixesReader.getPosition(); + + boolean next = false; + final int limit = target.offset + (termLen < target.length ? termLen : target.length); + int targetPos = target.offset + termBlockPrefix; + while(targetPos < limit) { + final int cmp = (termSuffixes[bytePos++]&0xFF) - (target.bytes[targetPos++]&0xFF); + if (cmp < 0) { + // Current term is still before the target; + // keep scanning + next = true; + break; + } else if (cmp > 0) { + // Done! Current term is after target. Stop + // here, fill in real term, return NOT_FOUND. + term.length = termBlockPrefix + suffix; + if (term.bytes.length < term.length) { + term.grow(term.length); + } + termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix); + //System.out.println(" NOT_FOUND"); + return SeekStatus.NOT_FOUND; + } + } + + if (!next && target.length <= termLen) { + term.length = termBlockPrefix + suffix; + if (term.bytes.length < term.length) { + term.grow(term.length); + } + termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix); + + if (target.length == termLen) { + // Done! Exact match. Stop here, fill in + // real term, return FOUND. + //System.out.println(" FOUND"); + + if (useCache) { + // Store in cache + decodeMetaData(); + //System.out.println(" cache! 
state=" + state); + termsCache.put(new FieldAndTerm(fieldTerm), (BlockTermState) state.clone()); + } + + return SeekStatus.FOUND; + } else { + //System.out.println(" NOT_FOUND"); + return SeekStatus.NOT_FOUND; + } + } + + if (state.termCount == state.blockTermCount) { + // Must pre-fill term for next block's common prefix + term.length = termBlockPrefix + suffix; + if (term.bytes.length < term.length) { + term.grow(term.length); + } + termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix); + break; + } else { + termSuffixesReader.skipBytes(suffix); + } + } + // The purpose of the terms dict index is to seek // the enum to the closest index term before the // term we are looking for. So, we should never // cross another index term (besides the first // one) while we are scanning: + assert indexIsCurrent; + + if (!nextBlock()) { + //System.out.println(" END"); + indexIsCurrent = false; + return SeekStatus.END; + } + common = 0; } - - indexIsCurrent = false; - //System.out.println(" END"); - return SeekStatus.END; } @Override @@ -515,12 +644,10 @@ decode all metadata up to the current term. */ private BytesRef _next() throws IOException { //System.out.println("BTR._next seg=" + segment + " this=" + this + " termCount=" + state.termCount + " (vs " + state.blockTermCount + ")"); - if (state.termCount == state.blockTermCount) { - if (!nextBlock()) { - //System.out.println(" eof"); - indexIsCurrent = false; - return null; - } + if (state.termCount == state.blockTermCount && !nextBlock()) { + //System.out.println(" eof"); + indexIsCurrent = false; + return null; } // TODO: cutover to something better for these ints! simple64? @@ -689,7 +816,7 @@ } //System.out.println(" termSuffixes len=" + len); in.readBytes(termSuffixes, 0, len); - termSuffixesReader.reset(termSuffixes); + termSuffixesReader.reset(termSuffixes, 0, len); // docFreq, totalTermFreq len = in.readVInt(); @@ -698,7 +825,7 @@ } //System.out.println(" freq bytes len=" + len); in.readBytes(docFreqBytes, 0, len); - freqReader.reset(docFreqBytes); + freqReader.reset(docFreqBytes, 0, len); metaDataUpto = 0; state.termCount = 0; @@ -717,23 +844,32 @@ if (!seekPending) { // lazily catch up on metadata decode: final int limit = state.termCount; + // We must set/incr state.termCount because + // postings impl can look at this state.termCount = metaDataUpto; + // TODO: better API would be "jump straight to term=N"??? while (metaDataUpto < limit) { - //System.out.println(" decode"); + //System.out.println(" decode mdUpto=" + metaDataUpto); // TODO: we could make "tiers" of metadata, ie, // decode docFreq/totalTF but don't decode postings // metadata; this way caller could get // docFreq/totalTF w/o paying decode cost for // postings + + // TODO: if docFreq were bulk decoded we could + // just skipN here: state.docFreq = freqReader.readVInt(); + //System.out.println(" dF=" + state.docFreq); if (!fieldInfo.omitTermFreqAndPositions) { state.totalTermFreq = state.docFreq + freqReader.readVLong(); + //System.out.println(" totTF=" + state.totalTermFreq); } + postingsReader.nextTerm(fieldInfo, state); metaDataUpto++; state.termCount++; } - } else { + //} else { //System.out.println(" skip! 
seekPending"); } } Index: lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java =================================================================== --- lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java (revision 1070898) +++ lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java (working copy) @@ -150,7 +150,7 @@ success = false; try { - FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); + FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter); success = true; return ret; } finally { @@ -195,7 +195,6 @@ state.segmentInfo.name, postingsReader, state.readBufferSize, - BytesRef.getUTF8SortedAsUnicodeComparator(), StandardCodec.TERMS_CACHE_SIZE, state.codecId); success = true; Index: lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java =================================================================== --- lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java (revision 1070898) +++ lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java (working copy) @@ -126,7 +126,7 @@ success = false; try { - FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); + FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter); success = true; return ret; } finally { @@ -170,7 +170,6 @@ state.segmentInfo.name, postingsReader, state.readBufferSize, - BytesRef.getUTF8SortedAsUnicodeComparator(), StandardCodec.TERMS_CACHE_SIZE, state.codecId); success = true; Index: lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java =================================================================== --- lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java (revision 1070898) +++ lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java (working copy) @@ -70,7 +70,7 @@ success = false; try { - FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); + FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter); success = true; return ret; } finally { @@ -114,7 +114,6 @@ state.segmentInfo.name, postingsReader, state.readBufferSize, - BytesRef.getUTF8SortedAsUnicodeComparator(), StandardCodec.TERMS_CACHE_SIZE, state.codecId); success = true; Index: lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java =================================================================== --- lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java (revision 1070898) +++ lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java (working copy) @@ -205,7 +205,7 @@ success = false; try { - FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); + FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter); success = true; return ret; } finally { @@ -306,7 +306,6 @@ state.segmentInfo.name, postingsReader, state.readBufferSize, - BytesRef.getUTF8SortedAsUnicodeComparator(), termsCacheSize, state.codecId); success = true; Index: 
lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictWriter.java
===================================================================
--- lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictWriter.java	(revision 1070898)
+++ lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictWriter.java	(working copy)
@@ -18,23 +18,21 @@
  */
 
 import java.io.IOException;
-import java.util.Comparator;
 
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.index.codecs.PostingsWriterBase;
 import org.apache.lucene.index.codecs.BlockTermsWriter;
 import org.apache.lucene.index.codecs.TermsIndexWriterBase;
 import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CodecUtil;
 
 public class AppendingTermsDictWriter extends BlockTermsWriter {
   final static String CODEC_NAME = "APPENDING_TERMS_DICT";
 
   public AppendingTermsDictWriter(TermsIndexWriterBase indexWriter,
-      SegmentWriteState state, PostingsWriterBase postingsWriter,
-      Comparator<BytesRef> termComp) throws IOException {
-    super(indexWriter, state, postingsWriter, termComp);
+      SegmentWriteState state, PostingsWriterBase postingsWriter)
+      throws IOException {
+    super(indexWriter, state, postingsWriter);
   }
 
   @Override
Index: lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java
===================================================================
--- lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java	(revision 1070898)
+++ lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java	(working copy)
@@ -71,7 +71,7 @@
     }
     success = false;
     try {
-      FieldsConsumer ret = new AppendingTermsDictWriter(indexWriter, state, docsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
+      FieldsConsumer ret = new AppendingTermsDictWriter(indexWriter, state, docsWriter);
       success = true;
       return ret;
     } finally {
@@ -111,7 +111,6 @@
                                                 state.dir, state.fieldInfos,
                                                 state.segmentInfo.name, docsReader,
                                                 state.readBufferSize,
-                                                BytesRef.getUTF8SortedAsUnicodeComparator(),
                                                 StandardCodec.TERMS_CACHE_SIZE,
                                                 state.codecId);
       success = true;
Index: lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictReader.java
===================================================================
--- lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictReader.java	(revision 1070898)
+++ lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictReader.java	(working copy)
@@ -18,7 +18,6 @@
  */
 
 import java.io.IOException;
-import java.util.Comparator;
 
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.codecs.PostingsReaderBase;
@@ -27,7 +26,6 @@
 import org.apache.lucene.index.codecs.TermsIndexReaderBase;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CodecUtil;
 
 public class AppendingTermsDictReader extends BlockTermsReader {
@@ -35,9 +33,9 @@
   public AppendingTermsDictReader(TermsIndexReaderBase indexReader,
       Directory dir, FieldInfos fieldInfos, String segment,
       PostingsReaderBase postingsReader, int readBufferSize,
-      Comparator<BytesRef> termComp, int termsCacheSize, String codecId) throws IOException {
+      int termsCacheSize, String codecId) throws IOException {
     super(indexReader, dir, fieldInfos, segment, postingsReader, readBufferSize,
-          termComp, termsCacheSize, codecId);
+          termsCacheSize, codecId);
   }
 
   @Override