diff -r b896c1f47a25 lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java --- a/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java Wed Sep 15 18:41:40 2010 +0000 +++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java Thu Sep 16 05:41:06 2010 -0400 @@ -26,14 +26,14 @@ import org.apache.lucene.index.codecs.Codec; import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.FieldsProducer; -import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexReader; +import org.apache.lucene.index.codecs.SimpleTermsIndexReader; import org.apache.lucene.index.codecs.standard.StandardCodec; +import org.apache.lucene.index.codecs.AbstractPostingsReader; import org.apache.lucene.index.codecs.standard.StandardPostingsReader; -import org.apache.lucene.index.codecs.standard.StandardPostingsReaderImpl; +import org.apache.lucene.index.codecs.AbstractPostingsWriter; import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; -import org.apache.lucene.index.codecs.standard.StandardPostingsWriterImpl; -import org.apache.lucene.index.codecs.standard.StandardTermsDictReader; -import org.apache.lucene.index.codecs.standard.StandardTermsIndexReader; +import org.apache.lucene.index.codecs.PrefixCodedTermsReader; +import org.apache.lucene.index.codecs.AbstractTermsIndexReader; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; @@ -58,7 +58,7 @@ @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - StandardPostingsWriter docsWriter = new StandardPostingsWriterImpl(state); + AbstractPostingsWriter docsWriter = new StandardPostingsWriter(state); boolean success = false; AppendingTermsIndexWriter indexWriter = null; try { @@ -88,8 +88,8 @@ @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - StandardPostingsReader docsReader = new StandardPostingsReaderImpl(state.dir, state.segmentInfo, state.readBufferSize); - StandardTermsIndexReader indexReader; + AbstractPostingsReader docsReader = new StandardPostingsReader(state.dir, state.segmentInfo, state.readBufferSize); + AbstractTermsIndexReader indexReader; boolean success = false; try { @@ -128,9 +128,9 @@ @Override public void files(Directory dir, SegmentInfo segmentInfo, Set files) throws IOException { - StandardPostingsReaderImpl.files(dir, segmentInfo, files); - StandardTermsDictReader.files(dir, segmentInfo, files); - SimpleStandardTermsIndexReader.files(dir, segmentInfo, files); + StandardPostingsReader.files(dir, segmentInfo, files); + PrefixCodedTermsReader.files(dir, segmentInfo, files); + SimpleTermsIndexReader.files(dir, segmentInfo, files); } @Override diff -r b896c1f47a25 lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictReader.java --- a/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictReader.java Wed Sep 15 18:41:40 2010 +0000 +++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictReader.java Thu Sep 16 05:41:06 2010 -0400 @@ -21,20 +21,20 @@ import java.util.Comparator; import org.apache.lucene.index.FieldInfos; -import org.apache.lucene.index.codecs.standard.StandardPostingsReader; -import org.apache.lucene.index.codecs.standard.StandardTermsDictReader; -import org.apache.lucene.index.codecs.standard.StandardTermsDictWriter; -import 
org.apache.lucene.index.codecs.standard.StandardTermsIndexReader; +import org.apache.lucene.index.codecs.AbstractPostingsReader; +import org.apache.lucene.index.codecs.PrefixCodedTermsReader; +import org.apache.lucene.index.codecs.PrefixCodedTermsWriter; +import org.apache.lucene.index.codecs.AbstractTermsIndexReader; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CodecUtil; -public class AppendingTermsDictReader extends StandardTermsDictReader { +public class AppendingTermsDictReader extends PrefixCodedTermsReader { - public AppendingTermsDictReader(StandardTermsIndexReader indexReader, + public AppendingTermsDictReader(AbstractTermsIndexReader indexReader, Directory dir, FieldInfos fieldInfos, String segment, - StandardPostingsReader postingsReader, int readBufferSize, + AbstractPostingsReader postingsReader, int readBufferSize, Comparator termComp, int termsCacheSize) throws IOException { super(indexReader, dir, fieldInfos, segment, postingsReader, readBufferSize, termComp, termsCacheSize); @@ -43,7 +43,7 @@ @Override protected void readHeader(IndexInput in) throws IOException { CodecUtil.checkHeader(in, AppendingTermsDictWriter.CODEC_NAME, - StandardTermsDictWriter.VERSION_START, StandardTermsDictWriter.VERSION_CURRENT); + PrefixCodedTermsWriter.VERSION_START, PrefixCodedTermsWriter.VERSION_CURRENT); } @Override diff -r b896c1f47a25 lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictWriter.java --- a/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictWriter.java Wed Sep 15 18:41:40 2010 +0000 +++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictWriter.java Thu Sep 16 05:41:06 2010 -0400 @@ -21,18 +21,18 @@ import java.util.Comparator; import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; -import org.apache.lucene.index.codecs.standard.StandardTermsDictWriter; -import org.apache.lucene.index.codecs.standard.StandardTermsIndexWriter; +import org.apache.lucene.index.codecs.AbstractPostingsWriter; +import org.apache.lucene.index.codecs.PrefixCodedTermsWriter; +import org.apache.lucene.index.codecs.AbstractTermsIndexWriter; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CodecUtil; -public class AppendingTermsDictWriter extends StandardTermsDictWriter { +public class AppendingTermsDictWriter extends PrefixCodedTermsWriter { final static String CODEC_NAME = "APPENDING_TERMS_DICT"; - public AppendingTermsDictWriter(StandardTermsIndexWriter indexWriter, - SegmentWriteState state, StandardPostingsWriter postingsWriter, + public AppendingTermsDictWriter(AbstractTermsIndexWriter indexWriter, + SegmentWriteState state, AbstractPostingsWriter postingsWriter, Comparator termComp) throws IOException { super(indexWriter, state, postingsWriter, termComp); } diff -r b896c1f47a25 lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsIndexReader.java --- a/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsIndexReader.java Wed Sep 15 18:41:40 2010 +0000 +++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsIndexReader.java Thu Sep 16 05:41:06 2010 -0400 @@ -21,13 +21,13 @@ import java.util.Comparator; import org.apache.lucene.index.FieldInfos; -import 
org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexReader; +import org.apache.lucene.index.codecs.SimpleTermsIndexReader; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CodecUtil; -public class AppendingTermsIndexReader extends SimpleStandardTermsIndexReader { +public class AppendingTermsIndexReader extends SimpleTermsIndexReader { public AppendingTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor, Comparator termComp) diff -r b896c1f47a25 lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsIndexWriter.java --- a/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsIndexWriter.java Wed Sep 15 18:41:40 2010 +0000 +++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsIndexWriter.java Thu Sep 16 05:41:06 2010 -0400 @@ -20,11 +20,11 @@ import java.io.IOException; import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexWriter; +import org.apache.lucene.index.codecs.SimpleTermsIndexWriter; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.CodecUtil; -public class AppendingTermsIndexWriter extends SimpleStandardTermsIndexWriter { +public class AppendingTermsIndexWriter extends SimpleTermsIndexWriter { final static String CODEC_NAME = "APPENDING_TERMS_INDEX"; final static int VERSION_START = 0; final static int VERSION_CURRENT = VERSION_START; diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/AbstractPostingsReader.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lucene/src/java/org/apache/lucene/index/codecs/AbstractPostingsReader.java Thu Sep 16 05:41:06 2010 -0400 @@ -0,0 +1,57 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Closeable; + +import org.apache.lucene.index.codecs.TermState; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Bits; + +/** PrefixCodedTermsDictReader interacts with a single instance + * of this to manage creation of {@link DocsEnum} and + * {@link DocsAndPositionsEnum} instances. It provides an + * IndexInput (termsIn) where this class may read any + * previously stored data that it had written in its + * corresponding {@link AbstractPostingsWriter} at indexing + * time. 
+ * @lucene.experimental */ + +public abstract class AbstractPostingsReader implements Closeable { + + public abstract void init(IndexInput termsIn) throws IOException; + + /** Return a newly created empty TermState */ + public abstract TermState newTermState() throws IOException; + + public abstract void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState state, boolean isIndexTerm) throws IOException; + + /** Must fully consume state, since after this call that + * TermState may be reused. */ + public abstract DocsEnum docs(FieldInfo fieldInfo, TermState state, Bits skipDocs, DocsEnum reuse) throws IOException; + + /** Must fully consume state, since after this call that + * TermState may be reused. */ + public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState state, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException; + + public abstract void close() throws IOException; +} diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/AbstractPostingsWriter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lucene/src/java/org/apache/lucene/index/codecs/AbstractPostingsWriter.java Thu Sep 16 05:41:06 2010 -0400 @@ -0,0 +1,43 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Closeable; + +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.codecs.PostingsConsumer; + +/** + * @lucene.experimental + */ + +public abstract class AbstractPostingsWriter extends PostingsConsumer implements Closeable { + + public abstract void start(IndexOutput termsOut) throws IOException; + + public abstract void startTerm() throws IOException; + + /** Finishes the current term */ + public abstract void finishTerm(int numDocs, boolean isIndexTerm) throws IOException; + + public abstract void setField(FieldInfo fieldInfo); + + public abstract void close() throws IOException; +} diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/AbstractTermsIndexReader.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lucene/src/java/org/apache/lucene/index/codecs/AbstractTermsIndexReader.java Thu Sep 16 05:41:06 2010 -0400 @@ -0,0 +1,76 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; +import java.util.Collection; + + +// TODO +// - allow for non-regular index intervals? eg with a +// long string of rare terms, you don't need such +// frequent indexing + +/** + * TermsDictReader interacts with an instance of this class + * to manage its terms index. The writer must accept + * indexed terms (many pairs of CharSequence text + long + * fileOffset), and then this reader must be able to + * retrieve the nearest index term to a provided term + * text. + * @lucene.experimental */ + +public abstract class AbstractTermsIndexReader { + + static class TermsIndexResult { + long position; + final BytesRef term = new BytesRef(); + long offset; + }; + + public abstract class FieldReader { + /** Returns position of "largest" index term that's <= + * text. Returned TermsIndexResult may be reused + * across calls. This resets internal state, and + * expects that you'll then scan the file and + * sequentially call isIndexTerm for each term + * encountered. */ + public abstract void getIndexOffset(BytesRef term, TermsIndexResult result) throws IOException; + + public abstract void getIndexOffset(long ord, TermsIndexResult result) throws IOException; + + /** Call this sequentially for each term encountered, + * after calling {@link #getIndexOffset}. */ + public abstract boolean isIndexTerm(long ord, int docFreq, boolean onlyLoaded) throws IOException; + + /** Finds the next index term, after the specified + * ord. Returns true if one exists. */ + public abstract boolean nextIndexTerm(long ord, TermsIndexResult result) throws IOException; + } + + public abstract FieldReader getField(FieldInfo fieldInfo); + + public abstract void loadTermsIndex(int indexDivisor) throws IOException; + + public abstract void close() throws IOException; + + public abstract void getExtensions(Collection<String> extensions); +} \ No newline at end of file diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/AbstractTermsIndexWriter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lucene/src/java/org/apache/lucene/index/codecs/AbstractTermsIndexWriter.java Thu Sep 16 05:41:06 2010 -0400 @@ -0,0 +1,38 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.util.BytesRef; +import java.io.IOException; + +/** @lucene.experimental */ +public abstract class AbstractTermsIndexWriter { + + public abstract void setTermsOutput(IndexOutput out); + + public abstract class FieldWriter { + public abstract boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException; + public abstract void finish() throws IOException; + } + + public abstract FieldWriter addField(FieldInfo fieldInfo); + + public abstract void close() throws IOException; +} \ No newline at end of file diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesReader.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesReader.java Thu Sep 16 05:41:06 2010 -0400 @@ -0,0 +1,48 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; + +// Handles reading incremental UTF8 encoded terms +final class DeltaBytesReader { + final BytesRef term = new BytesRef(); + final IndexInput in; + + DeltaBytesReader(IndexInput in) { + this.in = in; + term.bytes = new byte[10]; + } + + void reset(BytesRef text) { + term.copy(text); + } + + void read() throws IOException { + final int start = in.readVInt(); + final int suffix = in.readVInt(); + assert start <= term.length: "start=" + start + " length=" + term.length; + final int newLength = start+suffix; + term.grow(newLength); + in.readBytes(term.bytes, start, suffix); + term.length = newLength; + } +} diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesWriter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesWriter.java Thu Sep 16 05:41:06 2010 -0400 @@ -0,0 +1,67 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; + +final class DeltaBytesWriter { + + private byte[] lastBytes = new byte[10]; + private int lastLength; + final IndexOutput out; + + DeltaBytesWriter(IndexOutput out) { + this.out = out; + } + + void reset() { + lastLength = 0; + } + + void write(BytesRef text) throws IOException { + int start = 0; + int upto = text.offset; + final int length = text.length; + final byte[] bytes = text.bytes; + + final int limit = length < lastLength ? length : lastLength; + while(start < limit) { + if (bytes[upto] != lastBytes[start]) + break; + start++; + upto++; + } + + final int suffix = length - start; + out.writeVInt(start); // prefix + out.writeVInt(suffix); // suffix + out.writeBytes(bytes, upto, suffix); + if (lastBytes.length < length) { + lastBytes = ArrayUtil.grow(lastBytes, length); + } + // TODO: is this copy really necessary? I don't think + // caller actually modifies these bytes, so we can save + // by reference? + System.arraycopy(bytes, upto, lastBytes, start, suffix); + lastLength = length; + } +} diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java Thu Sep 16 05:41:06 2010 -0400 @@ -0,0 +1,498 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Closeable; +import java.util.Collection; +import java.util.Iterator; +import java.util.TreeMap; +import java.util.Comparator; + +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.FieldsEnum; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.codecs.FieldsProducer; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.DoubleBarrelLRUCache; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; + +/** Handles a terms dict, but decouples all details of + * doc/freqs/positions reading to an instance of {@link + * AbstractPostingsReader}. 
This class is reusable for + * codecs that use a different format for + * docs/freqs/positions (though codecs are also free to + * make their own terms dict impl). + * + *
<p>
This class also interacts with an instance of {@link + * AbstractTermsIndexReader}, to abstract away the specific + * implementation of the terms dict index. + * @lucene.experimental */ + +public class PrefixCodedTermsReader extends FieldsProducer { + // Open input to the main terms dict file (_X.tis) + private final IndexInput in; + + // Reads the terms dict entries, to gather state to + // produce DocsEnum on demand + private final AbstractPostingsReader postingsReader; + + private final TreeMap<String,FieldReader> fields = new TreeMap<String,FieldReader>(); + + // Comparator that orders our terms + private final Comparator<BytesRef> termComp; + + // Caches the most recently looked-up field + terms: + private final DoubleBarrelLRUCache<FieldAndTerm,TermState> termsCache; + + // Reads the terms index + private AbstractTermsIndexReader indexReader; + + // keeps the dirStart offset + protected long dirOffset; + + // Used as key for the terms cache + private static class FieldAndTerm extends DoubleBarrelLRUCache.CloneableKey { + String field; + BytesRef term; + + public FieldAndTerm() { + } + + public FieldAndTerm(FieldAndTerm other) { + field = other.field; + term = new BytesRef(other.term); + } + + @Override + public boolean equals(Object _other) { + FieldAndTerm other = (FieldAndTerm) _other; + return other.field == field && term.bytesEquals(other.term); + } + + @Override + public Object clone() { + return new FieldAndTerm(this); + } + + @Override + public int hashCode() { + return field.hashCode() * 31 + term.hashCode(); + } + } + + public PrefixCodedTermsReader(AbstractTermsIndexReader indexReader, Directory dir, FieldInfos fieldInfos, String segment, AbstractPostingsReader postingsReader, int readBufferSize, + Comparator<BytesRef> termComp, int termsCacheSize) + throws IOException { + + this.postingsReader = postingsReader; + termsCache = new DoubleBarrelLRUCache<FieldAndTerm,TermState>(termsCacheSize); + + this.termComp = termComp; + + in = dir.openInput(IndexFileNames.segmentFileName(segment, "", PrefixCodedTermsWriter.TERMS_EXTENSION), + readBufferSize); + + boolean success = false; + try { + readHeader(in); + + // Have PostingsReader init itself + postingsReader.init(in); + + // Read per-field details + seekDir(in, dirOffset); + + final int numFields = in.readInt(); + + for(int i=0;i<numFields;i++) { + final int field = in.readInt(); + final long numTerms = in.readLong(); + assert numTerms >= 0; + final long termsStartPointer = in.readLong(); + final AbstractTermsIndexReader.FieldReader fieldIndexReader; + final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); + fieldIndexReader = indexReader.getField(fieldInfo); + if (numTerms > 0) { + assert !fields.containsKey(fieldInfo.name); + fields.put(fieldInfo.name, new FieldReader(fieldIndexReader, fieldInfo, numTerms, termsStartPointer)); + } + } + success = true; + } finally { + if (!success) { + in.close(); + } + } + + this.indexReader = indexReader; + } + + protected void readHeader(IndexInput input) throws IOException { + CodecUtil.checkHeader(input, PrefixCodedTermsWriter.CODEC_NAME, + PrefixCodedTermsWriter.VERSION_START, PrefixCodedTermsWriter.VERSION_CURRENT); + dirOffset = input.readLong(); + } + + protected void seekDir(IndexInput input, long dirOffset) + throws IOException { + input.seek(dirOffset); + } + + @Override + public void loadTermsIndex(int indexDivisor) throws IOException { + indexReader.loadTermsIndex(indexDivisor); + } + + @Override + public void close() throws IOException { + try { + try { + if (indexReader != null) { + indexReader.close(); + } + } finally { + // null so if an app hangs on to us (ie, we are not + // GCable, despite being closed) we still free most + // ram + indexReader = null; + if (in != null) { + in.close(); + }
+ } + } finally { + try { + if (postingsReader != null) { + postingsReader.close(); + } + } finally { + for(FieldReader field : fields.values()) { + field.close(); + } + } + } + } + + public static void files(Directory dir, SegmentInfo segmentInfo, Collection files) { + files.add(IndexFileNames.segmentFileName(segmentInfo.name, "", PrefixCodedTermsWriter.TERMS_EXTENSION)); + } + + public static void getExtensions(Collection extensions) { + extensions.add(PrefixCodedTermsWriter.TERMS_EXTENSION); + } + + @Override + public FieldsEnum iterator() { + return new TermFieldsEnum(); + } + + @Override + public Terms terms(String field) throws IOException { + return fields.get(field); + } + + // Iterates through all fields + private class TermFieldsEnum extends FieldsEnum { + final Iterator it; + FieldReader current; + + TermFieldsEnum() { + it = fields.values().iterator(); + } + + @Override + public String next() { + if (it.hasNext()) { + current = it.next(); + return current.fieldInfo.name; + } else { + current = null; + return null; + } + } + + @Override + public TermsEnum terms() throws IOException { + return current.iterator(); + } + } + + private class FieldReader extends Terms implements Closeable { + final long numTerms; + final FieldInfo fieldInfo; + final long termsStartPointer; + final AbstractTermsIndexReader.FieldReader fieldIndexReader; + + FieldReader(AbstractTermsIndexReader.FieldReader fieldIndexReader, FieldInfo fieldInfo, long numTerms, long termsStartPointer) { + assert numTerms > 0; + this.fieldInfo = fieldInfo; + this.numTerms = numTerms; + this.termsStartPointer = termsStartPointer; + this.fieldIndexReader = fieldIndexReader; + } + + @Override + public Comparator getComparator() { + return termComp; + } + + @Override + public void close() { + super.close(); + } + + @Override + public TermsEnum iterator() throws IOException { + return new SegmentTermsEnum(); + } + + @Override + public long getUniqueTermCount() { + return numTerms; + } + + // Iterates through terms in this field + private class SegmentTermsEnum extends TermsEnum { + private final IndexInput in; + private final DeltaBytesReader bytesReader; + private final TermState state; + private boolean seekPending; + private final AbstractTermsIndexReader.TermsIndexResult indexResult = new AbstractTermsIndexReader.TermsIndexResult(); + private final FieldAndTerm fieldTerm = new FieldAndTerm(); + + SegmentTermsEnum() throws IOException { + in = (IndexInput) PrefixCodedTermsReader.this.in.clone(); + in.seek(termsStartPointer); + bytesReader = new DeltaBytesReader(in); + fieldTerm.field = fieldInfo.name; + state = postingsReader.newTermState(); + state.ord = -1; + } + + @Override + public Comparator getComparator() { + return termComp; + } + + /** Seeks until the first term that's >= the provided + * text; returns SeekStatus.FOUND if the exact term + * is found, SeekStatus.NOT_FOUND if a different term + * was found, SeekStatus.END if we hit EOF */ + @Override + public SeekStatus seek(BytesRef term, boolean useCache) throws IOException { + // Check cache + fieldTerm.term = term; + TermState cachedState; + if (useCache) { + cachedState = termsCache.get(fieldTerm); + if (cachedState != null) { + state.copy(cachedState); + seekPending = true; + bytesReader.term.copy(term); + return SeekStatus.FOUND; + } + } else { + cachedState = null; + } + + boolean doSeek = true; + + if (state.ord != -1) { + // we are positioned + + final int cmp = termComp.compare(bytesReader.term, term); + + if (cmp == 0) { + // already at the requested term 
+ return SeekStatus.FOUND; + } + + if (cmp < 0 && + fieldIndexReader.nextIndexTerm(state.ord, indexResult) && + termComp.compare(indexResult.term, term) > 0) { + // Optimization: requested term is within the + // same index block we are now in; skip seeking + // (but do scanning): + doSeek = false; + } + } + + // Used only for assert: + final long startOrd; + + if (doSeek) { + + // As index to find biggest index term that's <= + // our text: + fieldIndexReader.getIndexOffset(term, indexResult); + + in.seek(indexResult.offset); + seekPending = false; + + // NOTE: the first next() after an index seek is + // wasteful, since it redundantly reads the same + // bytes into the buffer. We could avoid storing + // those bytes in the primary file, but then when + // scanning over an index term we'd have to + // special case it: + bytesReader.reset(indexResult.term); + + state.ord = indexResult.position-1; + assert state.ord >= -1: "ord=" + state.ord + " pos=" + indexResult.position; + + startOrd = indexResult.position; + } else { + startOrd = -1; + } + + // Now scan: + while(next() != null) { + final int cmp = termComp.compare(bytesReader.term, term); + if (cmp == 0) { + + if (doSeek && useCache) { + // Store in cache + FieldAndTerm entryKey = new FieldAndTerm(fieldTerm); + cachedState = (TermState) state.clone(); + // this is fp after current term + cachedState.filePointer = in.getFilePointer(); + termsCache.put(entryKey, cachedState); + } + + return SeekStatus.FOUND; + } else if (cmp > 0) { + return SeekStatus.NOT_FOUND; + } + // The purpose of the terms dict index is to seek + // the enum to the closest index term before the + // term we are looking for. So, we should never + // cross another index term (besides the first + // one) while we are scanning: + assert state.ord == startOrd || !fieldIndexReader.isIndexTerm(state.ord, state.docFreq, true): "state.ord=" + state.ord + " startOrd=" + startOrd + " ir.isIndexTerm=" + fieldIndexReader.isIndexTerm(state.ord, state.docFreq, true) + " state.docFreq=" + state.docFreq; + } + + return SeekStatus.END; + } + + @Override + public SeekStatus seek(long ord) throws IOException { + + // TODO: should we cache term lookup by ord as well...? + + if (ord >= numTerms) { + state.ord = numTerms-1; + return SeekStatus.END; + } + + fieldIndexReader.getIndexOffset(ord, indexResult); + in.seek(indexResult.offset); + seekPending = false; + + // NOTE: the first next() after an index seek is + // wasteful, since it redundantly reads the same + // bytes into the buffer + bytesReader.reset(indexResult.term); + + state.ord = indexResult.position-1; + assert state.ord >= -1: "ord=" + state.ord; + + // Now, scan: + int left = (int) (ord - state.ord); + while(left > 0) { + final BytesRef term = next(); + assert term != null; + left--; + } + + // always found + return SeekStatus.FOUND; + } + + @Override + public BytesRef term() { + return bytesReader.term; + } + + @Override + public long ord() { + return state.ord; + } + + @Override + public BytesRef next() throws IOException { + + if (seekPending) { + seekPending = false; + in.seek(state.filePointer); + } + + if (state.ord >= numTerms-1) { + return null; + } + + bytesReader.read(); + state.docFreq = in.readVInt(); + + // TODO: would be cleaner, but space-wasting, to + // simply record a bit into each index entry as to + // whether it's an index entry or not, rather than + // re-compute that information... 
or, possibly store + // a "how many terms until next index entry" in each + // index entry, but that'd require some tricky + // lookahead work when writing the index + postingsReader.readTerm(in, + fieldInfo, state, + fieldIndexReader.isIndexTerm(1+state.ord, state.docFreq, false)); + + state.ord++; + + return bytesReader.term; + } + + @Override + public int docFreq() { + return state.docFreq; + } + + @Override + public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { + DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse); + assert docsEnum != null; + return docsEnum; + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException { + if (fieldInfo.omitTermFreqAndPositions) { + return null; + } else { + return postingsReader.docsAndPositions(fieldInfo, state, skipDocs, reuse); + } + } + } + } +} diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java Thu Sep 16 05:41:06 2010 -0400 @@ -0,0 +1,195 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Comparator; + +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.index.codecs.FieldsConsumer; +import org.apache.lucene.index.codecs.PostingsConsumer; +import org.apache.lucene.index.codecs.TermsConsumer; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.CodecUtil; + +/** + * Writes terms dict and interacts with docs/positions + * consumers to write the postings files. + * + * The [new] terms dict format is field-centric: each field + * has its own section in the file. Fields are written in + * UTF16 string comparison order. Within each field, each + * term's text is written in UTF16 string comparison order. 
+ * @lucene.experimental + */ + +public class PrefixCodedTermsWriter extends FieldsConsumer { + + final static String CODEC_NAME = "STANDARD_TERMS_DICT"; + + /** Extension of terms file */ + static final String TERMS_EXTENSION = "tis"; + + // Initial format + public static final int VERSION_START = 0; + + public static final int VERSION_CURRENT = VERSION_START; + + private final DeltaBytesWriter termWriter; + + protected final IndexOutput out; + final AbstractPostingsWriter postingsWriter; + final FieldInfos fieldInfos; + FieldInfo currentField; + private final AbstractTermsIndexWriter termsIndexWriter; + private final List<TermsConsumer> fields = new ArrayList<TermsConsumer>(); + private final Comparator<BytesRef> termComp; + + public PrefixCodedTermsWriter( + AbstractTermsIndexWriter termsIndexWriter, + SegmentWriteState state, + AbstractPostingsWriter postingsWriter, + Comparator<BytesRef> termComp) throws IOException + { + final String termsFileName = IndexFileNames.segmentFileName(state.segmentName, "", TERMS_EXTENSION); + this.termsIndexWriter = termsIndexWriter; + this.termComp = termComp; + out = state.directory.createOutput(termsFileName); + termsIndexWriter.setTermsOutput(out); + state.flushedFiles.add(termsFileName); + + fieldInfos = state.fieldInfos; + writeHeader(out); + termWriter = new DeltaBytesWriter(out); + currentField = null; + this.postingsWriter = postingsWriter; + + postingsWriter.start(out); // have consumer write its format/header + } + + protected void writeHeader(IndexOutput out) throws IOException { + CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT); + + out.writeLong(0); // leave space for end index pointer + } + + @Override + public TermsConsumer addField(FieldInfo field) { + assert currentField == null || currentField.name.compareTo(field.name) < 0; + currentField = field; + AbstractTermsIndexWriter.FieldWriter fieldIndexWriter = termsIndexWriter.addField(field); + TermsConsumer terms = new TermsWriter(fieldIndexWriter, field, postingsWriter); + fields.add(terms); + return terms; + } + + @Override + public void close() throws IOException { + + try { + final int fieldCount = fields.size(); + + final long dirStart = out.getFilePointer(); + + out.writeInt(fieldCount); + for(int i=0;i<fieldCount;i++) { + TermsWriter field = (TermsWriter) fields.get(i); + out.writeInt(field.fieldInfo.number); + out.writeLong(field.numTerms); + out.writeLong(field.termsStartPointer); + } + writeTrailer(dirStart); + } finally { + try { + out.close(); + } finally { + try { + postingsWriter.close(); + } finally { + termsIndexWriter.close(); + } + } + } + } + + protected void writeTrailer(long dirStart) throws IOException { + // seek back to the header and fill in the end index pointer + out.seek(CodecUtil.headerLength(CODEC_NAME)); + out.writeLong(dirStart); + } + + class TermsWriter extends TermsConsumer { + private final FieldInfo fieldInfo; + private final AbstractPostingsWriter postingsWriter; + private final long termsStartPointer; + private long numTerms; + private final AbstractTermsIndexWriter.FieldWriter fieldIndexWriter; + + TermsWriter(AbstractTermsIndexWriter.FieldWriter fieldIndexWriter, FieldInfo fieldInfo, AbstractPostingsWriter postingsWriter) { + this.fieldInfo = fieldInfo; + this.fieldIndexWriter = fieldIndexWriter; + this.postingsWriter = postingsWriter; + termWriter.reset(); + termsStartPointer = out.getFilePointer(); + postingsWriter.setField(fieldInfo); + } + + @Override + public Comparator<BytesRef> getComparator() { + return termComp; + } + + @Override + public PostingsConsumer startTerm(BytesRef text) throws IOException { + postingsWriter.startTerm(); + return postingsWriter; + } + + @Override + public void finishTerm(BytesRef text, int numDocs) throws IOException { + + assert numDocs > 0; + + final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, numDocs); + + termWriter.write(text); + out.writeVInt(numDocs); + + postingsWriter.finishTerm(numDocs, isIndexTerm); + numTerms++; + } + + // Finishes all terms in this field + @Override + public void finish() throws IOException { + fieldIndexWriter.finish(); + } + } +} diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/SimpleTermsIndexReader.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lucene/src/java/org/apache/lucene/index/codecs/SimpleTermsIndexReader.java Thu Sep 16 05:41:06 2010 -0400 @@ -0,0 +1,465 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.PagedBytes; +import org.apache.lucene.util.packed.PackedInts; + +import java.util.HashMap; +import java.util.Iterator; +import java.util.Collection; +import java.util.Comparator; +import java.io.IOException; + +/** + * Uses a simplistic format to record terms dict index + * information. Limitations: + * + * - Index for all fields is loaded entirely into RAM up + * front + * - Index is stored in RAM using shared byte[] that + * wastefully expands every term. Using FST to share + * common prefix & suffix would save RAM. + * - An index term is taken at regular intervals (every 128 + * terms by default); might be better to do it by "net docFreqs" + * encountered, so that for spans of low-freq terms we + * index less often. + * + * A better approach might be something similar to how + * postings are encoded, w/ multi-level skips. Ie, load all + * terms index data into memory, as a single large compactly + * encoded stream (eg delta bytes + delta offset). Index + * that w/ multi-level skipper. Then looking up a term is + * the equivalent binary search, using the skipper instead, + * while the data remains compressed in memory. + */ + +import org.apache.lucene.index.IndexFileNames; + +/** @lucene.experimental */ +public class SimpleTermsIndexReader extends AbstractTermsIndexReader { + + // NOTE: long is overkill here, since this number is 128 + // by default and only indexDivisor * 128 if you change + // the indexDivisor at search time. But, we use this in a + // number of places to multiply out the actual ord, and we + // will overflow int during those multiplies. So to avoid + // having to upgrade each multiple to long in multiple + // places (error prone), we use long here: + private long totalIndexInterval; + + private int indexDivisor; + final private int indexInterval; + + // Closed if indexLoaded is true: + final private IndexInput in; + private volatile boolean indexLoaded; + + private final Comparator<BytesRef> termComp; + + private final static int PAGED_BYTES_BITS = 15; + + // all fields share this single logical byte[] + private final PagedBytes termBytes = new PagedBytes(PAGED_BYTES_BITS); + private PagedBytes.Reader termBytesReader; + + final HashMap<FieldInfo,FieldIndexReader> fields = new HashMap<FieldInfo,FieldIndexReader>(); + + // start of the field info data + protected long dirOffset; + + public SimpleTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor, Comparator<BytesRef> termComp) + throws IOException { + + this.termComp = termComp; + + IndexInput in = dir.openInput(IndexFileNames.segmentFileName(segment, "", SimpleTermsIndexWriter.TERMS_INDEX_EXTENSION)); + + boolean success = false; + + try { + + readHeader(in); + indexInterval = in.readInt(); + this.indexDivisor = indexDivisor; + + if (indexDivisor < 0) { + totalIndexInterval = indexInterval; + } else { + // In case terms index gets loaded, later, on demand + totalIndexInterval = indexInterval * indexDivisor; + } + assert totalIndexInterval > 0; + + seekDir(in, dirOffset); + + // Read directory + final int numFields = in.readInt(); + + for(int i=0;i<numFields;i++) { + final int field = in.readInt(); + final int numIndexTerms = in.readInt(); + final long termsStart = in.readLong(); + final long indexStart = in.readLong(); + final long packedIndexStart = in.readLong(); + final long packedOffsetsStart = in.readLong(); + assert packedIndexStart >= indexStart: "packedStart=" + packedIndexStart + " indexStart=" + indexStart + " numIndexTerms=" + numIndexTerms + " seg=" + segment; + if (numIndexTerms > 0) { + final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); + fields.put(fieldInfo, new FieldIndexReader(in, fieldInfo, numIndexTerms, indexStart, termsStart, packedIndexStart, packedOffsetsStart)); + } + } + success = true; + } finally { + if (indexDivisor > 0) { + in.close(); + this.in = null; + if (success) { + indexLoaded = true; + } + termBytesReader = termBytes.freeze(true); + } else { + this.in = in; + } + } + } + + protected void readHeader(IndexInput input) throws IOException { + CodecUtil.checkHeader(input, SimpleTermsIndexWriter.CODEC_NAME, + SimpleTermsIndexWriter.VERSION_START, SimpleTermsIndexWriter.VERSION_CURRENT); + dirOffset = input.readLong(); + } + + private final class FieldIndexReader extends FieldReader { + + final private FieldInfo fieldInfo; + + private volatile CoreFieldIndex coreIndex; + + private final IndexInput in; + + private final long indexStart; + private final long termsStart; + private final long packedIndexStart; + private final long packedOffsetsStart; + + private final int numIndexTerms; + + public FieldIndexReader(IndexInput in, FieldInfo fieldInfo, int numIndexTerms, long indexStart, long termsStart, long packedIndexStart, + long packedOffsetsStart) throws IOException { + + this.fieldInfo = fieldInfo; + this.in = in; + this.termsStart = termsStart; + this.indexStart = indexStart; + this.packedIndexStart = packedIndexStart; + this.packedOffsetsStart = packedOffsetsStart; + this.numIndexTerms = numIndexTerms; + + // We still create the indexReader when indexDivisor + // is -1, so that PrefixCodedTermsReader can call + // isIndexTerm for each field: + if (indexDivisor > 0) { + coreIndex = new CoreFieldIndex(indexStart, + termsStart, + packedIndexStart,
packedOffsetsStart, numIndexTerms); + } + } + + @Override + public boolean isIndexTerm(long ord, int docFreq, boolean onlyLoaded) { + if (onlyLoaded) { + return ord % totalIndexInterval == 0; + } else { + return ord % indexInterval == 0; + } + } + + @Override + public boolean nextIndexTerm(long ord, TermsIndexResult result) throws IOException { + if (coreIndex == null) { + throw new IllegalStateException("terms index was not loaded"); + } else { + return coreIndex.nextIndexTerm(ord, result); + } + } + + @Override + public void getIndexOffset(BytesRef term, TermsIndexResult result) throws IOException { + // You must call loadTermsIndex if you had specified -1 for indexDivisor + if (coreIndex == null) { + throw new IllegalStateException("terms index was not loaded"); + } + coreIndex.getIndexOffset(term, result); + } + + @Override + public void getIndexOffset(long ord, TermsIndexResult result) throws IOException { + // You must call loadTermsIndex if you had specified + // indexDivisor < 0 to ctor + if (coreIndex == null) { + throw new IllegalStateException("terms index was not loaded"); + } + coreIndex.getIndexOffset(ord, result); + } + + private final class CoreFieldIndex { + + final private long termBytesStart; + + // offset into index termBytes + final PackedInts.Reader termOffsets; + + // index pointers into main terms dict + final PackedInts.Reader termsDictOffsets; + + final int numIndexTerms; + + final long termsStart; + + public CoreFieldIndex(long indexStart, long termsStart, long packedIndexStart, long packedOffsetsStart, int numIndexTerms) throws IOException { + + this.termsStart = termsStart; + termBytesStart = termBytes.getPointer(); + + IndexInput clone = (IndexInput) in.clone(); + clone.seek(indexStart); + + // -1 is passed to mean "don't load term index", but + // if we are then later loaded it's overwritten with + // a real value + assert indexDivisor > 0; + + this.numIndexTerms = 1+(numIndexTerms-1) / indexDivisor; + + assert this.numIndexTerms > 0: "numIndexTerms=" + numIndexTerms + " indexDivisor=" + indexDivisor; + + if (indexDivisor == 1) { + // Default (load all index terms) is fast -- slurp in the images from disk: + + try { + final long numTermBytes = packedIndexStart - indexStart; + termBytes.copy(clone, numTermBytes); + + // records offsets into main terms dict file + termsDictOffsets = PackedInts.getReader(clone); + assert termsDictOffsets.size() == numIndexTerms; + + // records offsets into byte[] term data + termOffsets = PackedInts.getReader(clone); + assert termOffsets.size() == 1+numIndexTerms; + } finally { + clone.close(); + } + } else { + // Get packed iterators + final IndexInput clone1 = (IndexInput) in.clone(); + final IndexInput clone2 = (IndexInput) in.clone(); + + try { + // Subsample the index terms + clone1.seek(packedIndexStart); + final PackedInts.ReaderIterator termsDictOffsetsIter = PackedInts.getReaderIterator(clone1); + + clone2.seek(packedOffsetsStart); + final PackedInts.ReaderIterator termOffsetsIter = PackedInts.getReaderIterator(clone2); + + // TODO: often we can get by w/ fewer bits per + // value, below.. .but this'd be more complex: + // we'd have to try @ fewer bits and then grow + // if we overflowed it. 
+ PackedInts.Mutable termsDictOffsetsM = PackedInts.getMutable(this.numIndexTerms, termsDictOffsetsIter.getBitsPerValue()); + PackedInts.Mutable termOffsetsM = PackedInts.getMutable(this.numIndexTerms+1, termOffsetsIter.getBitsPerValue()); + + termsDictOffsets = termsDictOffsetsM; + termOffsets = termOffsetsM; + + int upto = 0; + + long termOffsetUpto = 0; + + while(upto < this.numIndexTerms) { + // main file offset copies straight over + termsDictOffsetsM.set(upto, termsDictOffsetsIter.next()); + + termOffsetsM.set(upto, termOffsetUpto); + upto++; + + long termOffset = termOffsetsIter.next(); + long nextTermOffset = termOffsetsIter.next(); + final int numTermBytes = (int) (nextTermOffset - termOffset); + + clone.seek(indexStart + termOffset); + assert indexStart + termOffset < clone.length() : "indexStart=" + indexStart + " termOffset=" + termOffset + " len=" + clone.length(); + assert indexStart + termOffset + numTermBytes < clone.length(); + + termBytes.copy(clone, numTermBytes); + termOffsetUpto += numTermBytes; + + // skip terms: + termsDictOffsetsIter.next(); + for(int i=0;i<indexDivisor-2;i++) { + termOffsetsIter.next(); + termsDictOffsetsIter.next(); + } + } + + // record the end of the last term's bytes: + termOffsetsM.set(upto, termOffsetUpto); + + } finally { + clone1.close(); + clone2.close(); + clone.close(); + } + } + } + + private void fillResult(int idx, TermsIndexResult result) { + final long offset = termOffsets.get(idx); + final int length = (int) (termOffsets.get(1+idx) - offset); + termBytesReader.fill(result.term, termBytesStart + offset, length); + result.position = idx * totalIndexInterval; + result.offset = termsStart + termsDictOffsets.get(idx); + } + + public boolean nextIndexTerm(long ord, TermsIndexResult result) throws IOException { + int idx = 1 + (int) (ord / totalIndexInterval); + if (idx < numIndexTerms) { + fillResult(idx, result); + return true; + } else { + return false; + } + } + + public void getIndexOffset(BytesRef term, TermsIndexResult result) throws IOException { + // binary search across the in-memory index terms + int lo = 0; + int hi = numIndexTerms - 1; + assert totalIndexInterval > 0 : "totalIndexInterval=" + totalIndexInterval; + + while (hi >= lo) { + int mid = (lo + hi) >>> 1; + + final long offset = termOffsets.get(mid); + final int length = (int) (termOffsets.get(1+mid) - offset); + termBytesReader.fill(result.term, termBytesStart + offset, length); + + int delta = termComp.compare(term, result.term); + if (delta < 0) { + hi = mid - 1; + } else if (delta > 0) { + lo = mid + 1; + } else { + assert mid >= 0; + result.position = mid*totalIndexInterval; + result.offset = termsStart + termsDictOffsets.get(mid); + return; + } + } + if (hi < 0) { + assert hi == -1; + hi = 0; + } + + final long offset = termOffsets.get(hi); + final int length = (int) (termOffsets.get(1+hi) - offset); + termBytesReader.fill(result.term, termBytesStart + offset, length); + + result.position = hi*totalIndexInterval; + result.offset = termsStart + termsDictOffsets.get(hi); + } + + public void getIndexOffset(long ord, TermsIndexResult result) throws IOException { + int idx = (int) (ord / totalIndexInterval); + // caller must ensure ord is in bounds + assert idx < numIndexTerms; + fillResult(idx, result); + } + } + } + + @Override + public void loadTermsIndex(int indexDivisor) throws IOException { + if (!indexLoaded) { + + if (indexDivisor < 0) { + this.indexDivisor = -indexDivisor; + } else { + this.indexDivisor = indexDivisor; + } + this.totalIndexInterval = indexInterval * this.indexDivisor; + + Iterator<FieldIndexReader> it = fields.values().iterator(); + while(it.hasNext()) { + it.next().loadTermsIndex(); + } + + indexLoaded = true; + in.close(); + termBytesReader = termBytes.freeze(true); + } + } + + @Override + public FieldReader getField(FieldInfo fieldInfo) { + return fields.get(fieldInfo); + } + + public static void files(Directory dir, SegmentInfo info, Collection<String> files) { + files.add(IndexFileNames.segmentFileName(info.name, "", SimpleTermsIndexWriter.TERMS_INDEX_EXTENSION)); + } + + public static void getIndexExtensions(Collection<String> extensions) { + extensions.add(SimpleTermsIndexWriter.TERMS_INDEX_EXTENSION); + } + + @Override + public void getExtensions(Collection<String> extensions) { + getIndexExtensions(extensions); + } + + @Override + public void close() throws IOException { + if (in != null && !indexLoaded) { + in.close(); + } + if (termBytesReader != null) { + termBytesReader.close(); + } + } + + protected void seekDir(IndexInput input, long dirOffset) throws IOException { + input.seek(dirOffset); + } +} diff -r
b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/SimpleTermsIndexWriter.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lucene/src/java/org/apache/lucene/index/codecs/SimpleTermsIndexWriter.java Thu Sep 16 05:41:06 2010 -0400 @@ -0,0 +1,216 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.packed.PackedInts; + +import java.util.List; +import java.util.ArrayList; +import java.io.IOException; + +/** @lucene.experimental */ +public class SimpleTermsIndexWriter extends AbstractTermsIndexWriter { + + /** Extension of terms index file */ + static final String TERMS_INDEX_EXTENSION = "tii"; + + protected final IndexOutput out; + + final static String CODEC_NAME = "SIMPLE_STANDARD_TERMS_INDEX"; + final static int VERSION_START = 0; + final static int VERSION_CURRENT = VERSION_START; + + final private int termIndexInterval; + + private final List fields = new ArrayList(); + private final FieldInfos fieldInfos; // unread + private IndexOutput termsOut; + + public SimpleTermsIndexWriter(SegmentWriteState state) throws IOException { + final String indexFileName = IndexFileNames.segmentFileName(state.segmentName, "", TERMS_INDEX_EXTENSION); + state.flushedFiles.add(indexFileName); + termIndexInterval = state.termIndexInterval; + out = state.directory.createOutput(indexFileName); + fieldInfos = state.fieldInfos; + writeHeader(out); + out.writeInt(termIndexInterval); + } + + protected void writeHeader(IndexOutput out) throws IOException { + CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT); + // Placeholder for dir offset + out.writeLong(0); + } + + @Override + public void setTermsOutput(IndexOutput termsOut) { + this.termsOut = termsOut; + } + + @Override + public FieldWriter addField(FieldInfo field) { + SimpleFieldWriter writer = new SimpleFieldWriter(field); + fields.add(writer); + return writer; + } + + private class SimpleFieldWriter extends FieldWriter { + final FieldInfo fieldInfo; + int numIndexTerms; + final long indexStart; + final long termsStart; + long packedIndexStart; + long packedOffsetsStart; + private long numTerms; + + // TODO: we could conceivably make a PackedInts wrapper + // that auto-grows... 
then we wouldn't force 6 bytes RAM + // per index term: + private short[] termLengths; + private int[] termsPointerDeltas; + private long lastTermsPointer; + private long totTermLength; + + private final BytesRef lastTerm = new BytesRef(); + + SimpleFieldWriter(FieldInfo fieldInfo) { + this.fieldInfo = fieldInfo; + indexStart = out.getFilePointer(); + termsStart = lastTermsPointer = termsOut.getFilePointer(); + termLengths = new short[0]; + termsPointerDeltas = new int[0]; + } + + @Override + public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException { + // First term is first indexed term: + if (0 == (numTerms++ % termIndexInterval)) { + + // we can safely strip off the non-distinguishing + // suffix to save RAM in the loaded terms index. + final int limit = Math.min(lastTerm.length, text.length); + int minPrefixDiff = Math.min(1+lastTerm.length, text.length); + for(int byteIdx=0;byteIdx files) throws IOException { - StandardPostingsReaderImpl.files(dir, segmentInfo, files); - StandardTermsDictReader.files(dir, segmentInfo, files); - SimpleStandardTermsIndexReader.files(dir, segmentInfo, files); + StandardPostingsReader.files(dir, segmentInfo, files); + PrefixCodedTermsReader.files(dir, segmentInfo, files); + SimpleTermsIndexReader.files(dir, segmentInfo, files); } @Override diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java --- a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java Wed Sep 15 18:41:40 2010 +0000 +++ b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java Thu Sep 16 05:41:06 2010 -0400 @@ -22,7 +22,8 @@ import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.DocsAndPositionsEnum; -import org.apache.lucene.index.codecs.standard.TermState; +import org.apache.lucene.index.codecs.TermState; +import org.apache.lucene.index.codecs.AbstractPostingsReader; import org.apache.lucene.index.codecs.standard.StandardPostingsReader; import org.apache.lucene.index.codecs.pulsing.PulsingPostingsWriterImpl.Document; import org.apache.lucene.index.codecs.pulsing.PulsingPostingsWriterImpl.Position; @@ -39,13 +40,13 @@ // create two separate docs readers, one that also reads // prox and one that doesn't? 
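The checkIndexTerm hunk above (SimpleTermsIndexWriter.SimpleFieldWriter) records, for each indexed term, only the shortest prefix that distinguishes it from the previous term, so the loaded terms index never holds full term bytes. A minimal standalone sketch of that computation, assuming plain byte[] terms rather than BytesRef (the class and method names are illustrative, not part of the patch):

public class PrefixTrim {
  // Length of the shortest prefix of cur that differs from prev; a
  // binary search over truncated index terms still lands on the right
  // entry because terms are written in sorted order.
  static int distinguishingPrefixLength(byte[] prev, byte[] cur) {
    final int limit = Math.min(prev.length, cur.length);
    for (int i = 0; i < limit; i++) {
      if (prev[i] != cur[i]) {
        return i + 1;                 // keep up to the first differing byte
      }
    }
    // cur extends prev: one byte past the shared region suffices,
    // mirroring the Math.min(1+lastTerm.length, text.length) seed above
    return Math.min(limit + 1, cur.length);
  }
}

For "apple" followed by "apricot" this returns 3 ("apr"), which is all the index needs to route a seek between the two index points.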
-public class PulsingPostingsReaderImpl extends StandardPostingsReader { +public class PulsingPostingsReaderImpl extends AbstractPostingsReader { // Fallback reader for non-pulsed terms: - final StandardPostingsReader wrappedPostingsReader; + final AbstractPostingsReader wrappedPostingsReader; int maxPulsingDocFreq; - public PulsingPostingsReaderImpl(StandardPostingsReader wrappedPostingsReader) throws IOException { + public PulsingPostingsReaderImpl(AbstractPostingsReader wrappedPostingsReader) throws IOException { this.wrappedPostingsReader = wrappedPostingsReader; } diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java --- a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java Wed Sep 15 18:41:40 2010 +0000 +++ b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java Thu Sep 16 05:41:06 2010 -0400 @@ -22,6 +22,7 @@ import org.apache.lucene.index.FieldInfo; import org.apache.lucene.util.CodecUtil; import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; +import org.apache.lucene.index.codecs.AbstractPostingsWriter; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; @@ -34,7 +35,7 @@ // presumably rare in practice... /** @lucene.experimental */ -public final class PulsingPostingsWriterImpl extends StandardPostingsWriter { +public final class PulsingPostingsWriterImpl extends AbstractPostingsWriter { final static String CODEC = "PulsedPostings"; @@ -110,11 +111,11 @@ // TODO: -- lazy init this? ie, if every single term // was pulsed then we never need to use this fallback? // Fallback writer for non-pulsed terms: - final StandardPostingsWriter wrappedPostingsWriter; + final AbstractPostingsWriter wrappedPostingsWriter; /** If docFreq <= maxPulsingDocFreq, its postings are * inlined into terms dict */ - public PulsingPostingsWriterImpl(int maxPulsingDocFreq, StandardPostingsWriter wrappedPostingsWriter) throws IOException { + public PulsingPostingsWriterImpl(int maxPulsingDocFreq, AbstractPostingsWriter wrappedPostingsWriter) throws IOException { super(); pendingDocs = new Document[maxPulsingDocFreq]; diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java --- a/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java Wed Sep 15 18:41:40 2010 +0000 +++ b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java Thu Sep 16 05:41:06 2010 -0400 @@ -26,7 +26,8 @@ import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.codecs.standard.StandardPostingsReader; -import org.apache.lucene.index.codecs.standard.TermState; +import org.apache.lucene.index.codecs.TermState; +import org.apache.lucene.index.codecs.AbstractPostingsReader; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.Bits; @@ -43,7 +44,7 @@ // create two separate docs readers, one that also reads // prox and one that doesn't? 
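The pulsing classes above are deliberately thin decorators over any AbstractPostingsReader/AbstractPostingsWriter: postings for a term with docFreq <= maxPulsingDocFreq are inlined next to the term in the terms dict, and everything else is delegated to the wrapped implementation. A toy model of just that routing decision (the field names mirror the patch; the string buffers merely stand in for the terms dict and the postings files):

import java.util.List;

final class PulsingSketch {
  final int maxPulsingDocFreq;
  final StringBuilder termsDict = new StringBuilder();    // stand-in for inlined storage
  final StringBuilder postingsFile = new StringBuilder(); // stand-in for the wrapped writer

  PulsingSketch(int maxPulsingDocFreq) {
    this.maxPulsingDocFreq = maxPulsingDocFreq;
  }

  void finishTerm(String term, List<Integer> docs) {
    if (docs.size() <= maxPulsingDocFreq) {
      termsDict.append(term).append(docs).append('\n');    // pulsed: inline
    } else {
      postingsFile.append(term).append(docs).append('\n'); // delegate
    }
  }
}

The payoff is one less seek for rare terms: their postings arrive with the term itself instead of requiring a jump into the .frq/.prx files.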
-public class SepPostingsReaderImpl extends StandardPostingsReader { +public class SepPostingsReaderImpl extends AbstractPostingsReader { final IntIndexInput freqIn; final IntIndexInput docIn; diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java --- a/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java Wed Sep 15 18:41:40 2010 +0000 +++ b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java Thu Sep 16 05:41:06 2010 -0400 @@ -24,6 +24,7 @@ import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.codecs.AbstractPostingsWriter; import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.BytesRef; @@ -33,7 +34,7 @@ * to .pyl, skip data to .skp * * @lucene.experimental */ -public final class SepPostingsWriterImpl extends StandardPostingsWriter { +public final class SepPostingsWriterImpl extends AbstractPostingsWriter { final static String CODEC = "SepDocFreqSkip"; final static String DOC_EXTENSION = "doc"; diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/standard/DeltaBytesReader.java --- a/lucene/src/java/org/apache/lucene/index/codecs/standard/DeltaBytesReader.java Wed Sep 15 18:41:40 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,48 +0,0 @@ -package org.apache.lucene.index.codecs.standard; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.BytesRef; - -import java.io.IOException; - -// Handles reading incremental UTF8 encoded terms -final class DeltaBytesReader { - final BytesRef term = new BytesRef(); - final IndexInput in; - - DeltaBytesReader(IndexInput in) { - this.in = in; - term.bytes = new byte[10]; - } - - void reset(BytesRef text) { - term.copy(text); - } - - void read() throws IOException { - final int start = in.readVInt(); - final int suffix = in.readVInt(); - assert start <= term.length: "start=" + start + " length=" + term.length; - final int newLength = start+suffix; - term.grow(newLength); - in.readBytes(term.bytes, start, suffix); - term.length = newLength; - } -} diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/standard/DeltaBytesWriter.java --- a/lucene/src/java/org/apache/lucene/index/codecs/standard/DeltaBytesWriter.java Wed Sep 15 18:41:40 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,67 +0,0 @@ -package org.apache.lucene.index.codecs.standard; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.util.BytesRef; - -import java.io.IOException; - -final class DeltaBytesWriter { - - private byte[] lastBytes = new byte[10]; - private int lastLength; - final IndexOutput out; - - DeltaBytesWriter(IndexOutput out) { - this.out = out; - } - - void reset() { - lastLength = 0; - } - - void write(BytesRef text) throws IOException { - int start = 0; - int upto = text.offset; - final int length = text.length; - final byte[] bytes = text.bytes; - - final int limit = length < lastLength ? length : lastLength; - while(start < limit) { - if (bytes[upto] != lastBytes[start]) - break; - start++; - upto++; - } - - final int suffix = length - start; - out.writeVInt(start); // prefix - out.writeVInt(suffix); // suffix - out.writeBytes(bytes, upto, suffix); - if (lastBytes.length < length) { - lastBytes = ArrayUtil.grow(lastBytes, length); - } - // TODO: is this copy really necessary? I don't think - // caller actually modifies these bytes, so we can save - // by reference? - System.arraycopy(bytes, upto, lastBytes, start, suffix); - lastLength = length; - } -} diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java --- a/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java Wed Sep 15 18:41:40 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,465 +0,0 @@ -package org.apache.lucene.index.codecs.standard; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
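DeltaBytesReader and DeltaBytesWriter, deleted above as their logic moves into the new prefix-coded terms classes, implement the incremental term encoding: each term is written as (shared-prefix length, suffix length, suffix bytes) against its predecessor. A self-contained round-trip sketch under those assumptions, using DataOutputStream/DataInputStream and fixed-width ints in place of IndexOutput/IndexInput and VInts:

import java.io.*;
import java.util.Arrays;

final class DeltaBytesSketch {
  static void write(DataOutputStream out, byte[] last, byte[] cur) throws IOException {
    int start = 0;
    final int limit = Math.min(last.length, cur.length);
    while (start < limit && cur[start] == last[start]) {
      start++;                          // measure the shared prefix
    }
    out.writeInt(start);                // prefix length
    out.writeInt(cur.length - start);   // suffix length
    out.write(cur, start, cur.length - start);
  }

  static byte[] read(DataInputStream in, byte[] last) throws IOException {
    final int start = in.readInt();
    final int suffix = in.readInt();
    final byte[] term = Arrays.copyOf(last, start + suffix); // reuse prefix of prior term
    in.readFully(term, start, suffix);
    return term;
  }
}

Because sorted term lists share long prefixes, this usually reduces a term to a couple of small integers plus a short suffix on disk.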
- */ - -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.index.FieldInfos; -import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.SegmentInfo; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.CodecUtil; -import org.apache.lucene.util.PagedBytes; -import org.apache.lucene.util.packed.PackedInts; - -import java.util.HashMap; -import java.util.Iterator; -import java.util.Collection; -import java.util.Comparator; -import java.io.IOException; - -/** - * Uses a simplistic format to record terms dict index - * information. Limitations: - * - * - Index for all fields is loaded entirely into RAM up - * front - * - Index is stored in RAM using shared byte[] that - * wastefully expands every term. Using FST to share - * common prefix & suffix would save RAM. - * - Index is taken at regular numTerms (every 128 by - * default); might be better to do it by "net docFreqs" - * encountered, so that for spans of low-freq terms we - * take index less often. - * - * A better approach might be something similar to how - * postings are encoded, w/ multi-level skips. Ie, load all - * terms index data into memory, as a single large compactly - * encoded stream (eg delta bytes + delta offset). Index - * that w/ multi-level skipper. Then to look up a term is - * the equivalent binary search, using the skipper instead, - * while data remains compressed in memory. - */ - -import org.apache.lucene.index.IndexFileNames; - -/** @lucene.experimental */ -public class SimpleStandardTermsIndexReader extends StandardTermsIndexReader { - - // NOTE: long is overkill here, since this number is 128 - // by default and only indexDivisor * 128 if you change - // the indexDivisor at search time. But, we use this in a - // number of places to multiply out the actual ord, and we - // will overflow int during those multiplies.
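The NOTE above is easy to verify: an index-term position is computed as ord times the interval, and with the default interval of 128 the product passes Integer.MAX_VALUE well within plausible segment sizes, wrapping silently if the multiply is done in int. A two-line demonstration:

public class OverflowDemo {
  public static void main(String[] args) {
    int ord = 20000000;                        // index-term ord, plausible for a big segment
    int interval = 128;
    System.out.println(ord * interval);        // int multiply wraps: -1734967296
    System.out.println((long) ord * interval); // widened first: 2560000000
  }
}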
So to avoid - // having to upgrade each multiple to long in multiple - // places (error proned), we use long here: - private long totalIndexInterval; - - private int indexDivisor; - final private int indexInterval; - - // Closed if indexLoaded is true: - final private IndexInput in; - private volatile boolean indexLoaded; - - private final Comparator termComp; - - private final static int PAGED_BYTES_BITS = 15; - - // all fields share this single logical byte[] - private final PagedBytes termBytes = new PagedBytes(PAGED_BYTES_BITS); - private PagedBytes.Reader termBytesReader; - - final HashMap fields = new HashMap(); - - // start of the field info data - protected long dirOffset; - - public SimpleStandardTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor, Comparator termComp) - throws IOException { - - this.termComp = termComp; - - IndexInput in = dir.openInput(IndexFileNames.segmentFileName(segment, "", StandardCodec.TERMS_INDEX_EXTENSION)); - - boolean success = false; - - try { - - readHeader(in); - indexInterval = in.readInt(); - this.indexDivisor = indexDivisor; - - if (indexDivisor < 0) { - totalIndexInterval = indexInterval; - } else { - // In case terms index gets loaded, later, on demand - totalIndexInterval = indexInterval * indexDivisor; - } - assert totalIndexInterval > 0; - - seekDir(in, dirOffset); - - // Read directory - final int numFields = in.readInt(); - - for(int i=0;i= indexStart: "packedStart=" + packedIndexStart + " indexStart=" + indexStart + " numIndexTerms=" + numIndexTerms + " seg=" + segment; - if (numIndexTerms > 0) { - final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); - fields.put(fieldInfo, new FieldIndexReader(in, fieldInfo, numIndexTerms, indexStart, termsStart, packedIndexStart, packedOffsetsStart)); - } - } - success = true; - } finally { - if (indexDivisor > 0) { - in.close(); - this.in = null; - if (success) { - indexLoaded = true; - } - termBytesReader = termBytes.freeze(true); - } else { - this.in = in; - } - } - } - - protected void readHeader(IndexInput input) throws IOException { - CodecUtil.checkHeader(input, SimpleStandardTermsIndexWriter.CODEC_NAME, - SimpleStandardTermsIndexWriter.VERSION_START, SimpleStandardTermsIndexWriter.VERSION_START); - dirOffset = input.readLong(); - } - - private final class FieldIndexReader extends FieldReader { - - final private FieldInfo fieldInfo; - - private volatile CoreFieldIndex coreIndex; - - private final IndexInput in; - - private final long indexStart; - private final long termsStart; - private final long packedIndexStart; - private final long packedOffsetsStart; - - private final int numIndexTerms; - - public FieldIndexReader(IndexInput in, FieldInfo fieldInfo, int numIndexTerms, long indexStart, long termsStart, long packedIndexStart, - long packedOffsetsStart) throws IOException { - - this.fieldInfo = fieldInfo; - this.in = in; - this.termsStart = termsStart; - this.indexStart = indexStart; - this.packedIndexStart = packedIndexStart; - this.packedOffsetsStart = packedOffsetsStart; - this.numIndexTerms = numIndexTerms; - - // We still create the indexReader when indexDivisor - // is -1, so that StandardTermsDictReader can call - // isIndexTerm for each field: - if (indexDivisor > 0) { - coreIndex = new CoreFieldIndex(indexStart, - termsStart, - packedIndexStart, - packedOffsetsStart, - numIndexTerms); - - } - } - - public void loadTermsIndex() throws IOException { - if (coreIndex == null) { - coreIndex = new CoreFieldIndex(indexStart, termsStart, 
packedIndexStart, packedOffsetsStart, numIndexTerms); - } - } - - @Override - public boolean isIndexTerm(long ord, int docFreq, boolean onlyLoaded) { - if (onlyLoaded) { - return ord % totalIndexInterval == 0; - } else { - return ord % indexInterval == 0; - } - } - - @Override - public boolean nextIndexTerm(long ord, TermsIndexResult result) throws IOException { - if (coreIndex == null) { - throw new IllegalStateException("terms index was not loaded"); - } else { - return coreIndex.nextIndexTerm(ord, result); - } - } - - @Override - public void getIndexOffset(BytesRef term, TermsIndexResult result) throws IOException { - // You must call loadTermsIndex if you had specified -1 for indexDivisor - if (coreIndex == null) { - throw new IllegalStateException("terms index was not loaded"); - } - coreIndex.getIndexOffset(term, result); - } - - @Override - public void getIndexOffset(long ord, TermsIndexResult result) throws IOException { - // You must call loadTermsIndex if you had specified - // indexDivisor < 0 to ctor - if (coreIndex == null) { - throw new IllegalStateException("terms index was not loaded"); - } - coreIndex.getIndexOffset(ord, result); - } - - private final class CoreFieldIndex { - - final private long termBytesStart; - - // offset into index termBytes - final PackedInts.Reader termOffsets; - - // index pointers into main terms dict - final PackedInts.Reader termsDictOffsets; - - final int numIndexTerms; - - final long termsStart; - - public CoreFieldIndex(long indexStart, long termsStart, long packedIndexStart, long packedOffsetsStart, int numIndexTerms) throws IOException { - - this.termsStart = termsStart; - termBytesStart = termBytes.getPointer(); - - IndexInput clone = (IndexInput) in.clone(); - clone.seek(indexStart); - - // -1 is passed to mean "don't load term index", but - // if we are then later loaded it's overwritten with - // a real value - assert indexDivisor > 0; - - this.numIndexTerms = 1+(numIndexTerms-1) / indexDivisor; - - assert this.numIndexTerms > 0: "numIndexTerms=" + numIndexTerms + " indexDivisor=" + indexDivisor; - - if (indexDivisor == 1) { - // Default (load all index terms) is fast -- slurp in the images from disk: - - try { - final long numTermBytes = packedIndexStart - indexStart; - termBytes.copy(clone, numTermBytes); - - // records offsets into main terms dict file - termsDictOffsets = PackedInts.getReader(clone); - assert termsDictOffsets.size() == numIndexTerms; - - // records offsets into byte[] term data - termOffsets = PackedInts.getReader(clone); - assert termOffsets.size() == 1+numIndexTerms; - } finally { - clone.close(); - } - } else { - // Get packed iterators - final IndexInput clone1 = (IndexInput) in.clone(); - final IndexInput clone2 = (IndexInput) in.clone(); - - try { - // Subsample the index terms - clone1.seek(packedIndexStart); - final PackedInts.ReaderIterator termsDictOffsetsIter = PackedInts.getReaderIterator(clone1); - - clone2.seek(packedOffsetsStart); - final PackedInts.ReaderIterator termOffsetsIter = PackedInts.getReaderIterator(clone2); - - // TODO: often we can get by w/ fewer bits per - // value, below.. .but this'd be more complex: - // we'd have to try @ fewer bits and then grow - // if we overflowed it. 
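The TODO that closes the hunk above is about PackedInts sizing: the subsampled in-RAM arrays inherit bitsPerValue from the on-disk stream, even though the subsample's maximum value (especially the offsets into the shrunken termBytes) may fit in fewer bits. The arithmetic behind the potential saving, in plain Java rather than the PackedInts API (the example figures are hypothetical):

public class BitsRequired {
  // Fewest bits that can represent values in [0, maxValue].
  static int bitsRequired(long maxValue) {
    return Math.max(1, 64 - Long.numberOfLeadingZeros(maxValue));
  }

  public static void main(String[] args) {
    // Assume on-disk offsets reach 2^40, but after subsampling with a
    // large indexDivisor the loaded offsets only reach 2^33.
    int savedBitsPerEntry = bitsRequired(1L << 40) - bitsRequired(1L << 33); // 7
    long entries = 1000000;                                  // index terms kept in RAM
    System.out.println(entries * savedBitsPerEntry / 8 + " bytes saved");   // 875000
  }
}

As the TODO notes, using the smaller width up front would require growing the array whenever a later value overflows it, which is why the code keeps the simpler conservative width.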
- - PackedInts.Mutable termsDictOffsetsM = PackedInts.getMutable(this.numIndexTerms, termsDictOffsetsIter.getBitsPerValue()); - PackedInts.Mutable termOffsetsM = PackedInts.getMutable(this.numIndexTerms+1, termOffsetsIter.getBitsPerValue()); - - termsDictOffsets = termsDictOffsetsM; - termOffsets = termOffsetsM; - - int upto = 0; - - long termOffsetUpto = 0; - - while(upto < this.numIndexTerms) { - // main file offset copies straight over - termsDictOffsetsM.set(upto, termsDictOffsetsIter.next()); - - termOffsetsM.set(upto, termOffsetUpto); - upto++; - - long termOffset = termOffsetsIter.next(); - long nextTermOffset = termOffsetsIter.next(); - final int numTermBytes = (int) (nextTermOffset - termOffset); - - clone.seek(indexStart + termOffset); - assert indexStart + termOffset < clone.length() : "indexStart=" + indexStart + " termOffset=" + termOffset + " len=" + clone.length(); - assert indexStart + termOffset + numTermBytes < clone.length(); - - termBytes.copy(clone, numTermBytes); - termOffsetUpto += numTermBytes; - - // skip terms: - termsDictOffsetsIter.next(); - for(int i=0;i<indexDivisor-2;i++) { - termOffsetsIter.next(); - termsDictOffsetsIter.next(); - } - } - termOffsetsM.set(upto, termOffsetUpto); - } finally { - clone1.close(); - clone2.close(); - clone.close(); - } - } - } - - public boolean nextIndexTerm(long ord, TermsIndexResult result) throws IOException { - int idx = 1 + (int) (ord / totalIndexInterval); - if (idx < numIndexTerms) { - fillResult(idx, result); - return true; - } else { - return false; - } - } - - private void fillResult(int idx, TermsIndexResult result) { - final long offset = termOffsets.get(idx); - final int length = (int) (termOffsets.get(1+idx) - offset); - termBytesReader.fill(result.term, termBytesStart + offset, length); - result.position = idx * totalIndexInterval; - result.offset = termsStart + termsDictOffsets.get(idx); - } - - public void getIndexOffset(BytesRef term, TermsIndexResult result) throws IOException { - int lo = 0; // binary search - int hi = numIndexTerms - 1; - assert totalIndexInterval > 0 : "totalIndexInterval=" + totalIndexInterval; - - while (hi >= lo) { - int mid = (lo + hi) >>> 1; - - final long offset = termOffsets.get(mid); - final int length = (int) (termOffsets.get(1+mid) - offset); - termBytesReader.fill(result.term, termBytesStart + offset, length); - - int delta = termComp.compare(term, result.term); - if (delta < 0) { - hi = mid - 1; - } else if (delta > 0) { - lo = mid + 1; - } else { - assert mid >= 0; - result.position = mid*totalIndexInterval; - result.offset = termsStart + termsDictOffsets.get(mid); - return; - } - } - if (hi < 0) { - assert hi == -1; - hi = 0; - } - - final long offset = termOffsets.get(hi); - final int length = (int) (termOffsets.get(1+hi) - offset); - termBytesReader.fill(result.term, termBytesStart + offset, length); - - result.position = hi*totalIndexInterval; - result.offset = termsStart + termsDictOffsets.get(hi); - } - - public void getIndexOffset(long ord, TermsIndexResult result) throws IOException { - int idx = (int) (ord / totalIndexInterval); - // caller must ensure ord is in bounds - assert idx < numIndexTerms; - fillResult(idx, result); - } - } - } - - @Override - public void loadTermsIndex(int indexDivisor) throws IOException { - if (!indexLoaded) { - - if (indexDivisor < 0) { - this.indexDivisor = -indexDivisor; - } else { - this.indexDivisor = indexDivisor; - } - this.totalIndexInterval = indexInterval * this.indexDivisor; - - Iterator<FieldIndexReader> it = fields.values().iterator(); - while(it.hasNext()) { - it.next().loadTermsIndex(); - } - - indexLoaded = true; - in.close(); - termBytesReader = termBytes.freeze(true); - } - } - - @Override - public FieldReader getField(FieldInfo fieldInfo) { - return fields.get(fieldInfo); - } - - public static void files(Directory dir, SegmentInfo info, Collection<String> files) { - files.add(IndexFileNames.segmentFileName(info.name, "", StandardCodec.TERMS_INDEX_EXTENSION)); - } - - public static void getIndexExtensions(Collection<String> extensions) { - extensions.add(StandardCodec.TERMS_INDEX_EXTENSION); - } - - @Override - public void getExtensions(Collection<String> extensions) { - getIndexExtensions(extensions); - } - - @Override - public void close() throws IOException { - if (in != null && !indexLoaded) { - in.close(); - } - if (termBytesReader != null) { - termBytesReader.close(); - } - } - - protected void seekDir(IndexInput input, long dirOffset) throws IOException { - input.seek(dirOffset); - } -} diff -r
lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java --- a/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java Wed Sep 15 18:41:40 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,212 +0,0 @@ -package org.apache.lucene.index.codecs.standard; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.index.FieldInfos; -import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.CodecUtil; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.packed.PackedInts; - -import java.util.List; -import java.util.ArrayList; -import java.io.IOException; - -/** @lucene.experimental */ -public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter { - protected final IndexOutput out; - - final static String CODEC_NAME = "SIMPLE_STANDARD_TERMS_INDEX"; - final static int VERSION_START = 0; - final static int VERSION_CURRENT = VERSION_START; - - final private int termIndexInterval; - - private final List fields = new ArrayList(); - private final FieldInfos fieldInfos; // unread - private IndexOutput termsOut; - - public SimpleStandardTermsIndexWriter(SegmentWriteState state) throws IOException { - final String indexFileName = IndexFileNames.segmentFileName(state.segmentName, "", StandardCodec.TERMS_INDEX_EXTENSION); - state.flushedFiles.add(indexFileName); - termIndexInterval = state.termIndexInterval; - out = state.directory.createOutput(indexFileName); - fieldInfos = state.fieldInfos; - writeHeader(out); - out.writeInt(termIndexInterval); - } - - protected void writeHeader(IndexOutput out) throws IOException { - CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT); - // Placeholder for dir offset - out.writeLong(0); - } - - @Override - public void setTermsOutput(IndexOutput termsOut) { - this.termsOut = termsOut; - } - - @Override - public FieldWriter addField(FieldInfo field) { - SimpleFieldWriter writer = new SimpleFieldWriter(field); - fields.add(writer); - return writer; - } - - private class SimpleFieldWriter extends FieldWriter { - final FieldInfo fieldInfo; - int numIndexTerms; - final long indexStart; - final long termsStart; - long packedIndexStart; - long packedOffsetsStart; - private long numTerms; - - // TODO: we could conceivably make a PackedInts wrapper - // that auto-grows... 
then we wouldn't force 6 bytes RAM - // per index term: - private short[] termLengths; - private int[] termsPointerDeltas; - private long lastTermsPointer; - private long totTermLength; - - private final BytesRef lastTerm = new BytesRef(); - - SimpleFieldWriter(FieldInfo fieldInfo) { - this.fieldInfo = fieldInfo; - indexStart = out.getFilePointer(); - termsStart = lastTermsPointer = termsOut.getFilePointer(); - termLengths = new short[0]; - termsPointerDeltas = new int[0]; - } - - @Override - public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException { - // First term is first indexed term: - if (0 == (numTerms++ % termIndexInterval)) { - - // we can safely strip off the non-distinguishing - // suffix to save RAM in the loaded terms index. - final int limit = Math.min(lastTerm.length, text.length); - int minPrefixDiff = Math.min(1+lastTerm.length, text.length); - for(int byteIdx=0;byteIdx files) throws IOException { - StandardPostingsReaderImpl.files(dir, segmentInfo, files); - StandardTermsDictReader.files(dir, segmentInfo, files); - SimpleStandardTermsIndexReader.files(dir, segmentInfo, files); + StandardPostingsReader.files(dir, segmentInfo, files); + PrefixCodedTermsReader.files(dir, segmentInfo, files); + SimpleTermsIndexReader.files(dir, segmentInfo, files); } @Override @@ -143,7 +145,7 @@ public static void getStandardExtensions(Set extensions) { extensions.add(FREQ_EXTENSION); extensions.add(PROX_EXTENSION); - StandardTermsDictReader.getExtensions(extensions); - SimpleStandardTermsIndexReader.getIndexExtensions(extensions); + PrefixCodedTermsReader.getExtensions(extensions); + SimpleTermsIndexReader.getIndexExtensions(extensions); } } diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java --- a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java Wed Sep 15 18:41:40 2010 +0000 +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java Thu Sep 16 05:41:06 2010 -0400 @@ -18,39 +18,597 @@ */ import java.io.IOException; -import java.io.Closeable; +import java.util.Collection; +import org.apache.lucene.store.Directory; +import org.apache.lucene.index.codecs.TermState; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.DocsAndPositionsEnum; -import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.codecs.AbstractPostingsReader; +import org.apache.lucene.index.codecs.TermState; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; -/** StandardTermsDictReader interacts with a single instance - * of this to manage creation of {@link DocsEnum} and - * {@link DocsAndPositionsEnum} instances. It provides an - * IndexInput (termsIn) where this class may read any - * previously stored data that it had written in its - * corresponding {@link StandardPostingsWriter} at indexing - * time. +/** Concrete class that reads the current doc/freq/skip + * postings format. 
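The StandardPostingsReader constructor just below opens the .frq file first and the .prx file second, guarding the pair with the success-flag idiom: if the second open throws, the finally block closes the first so no file handle leaks. The idiom in isolation, with FileInputStream standing in for Directory.openInput:

import java.io.*;

public class PairedOpen {
  final InputStream freqIn;
  final InputStream proxIn;

  PairedOpen(File frq, File prx) throws IOException {
    freqIn = new FileInputStream(frq);
    boolean success = false;
    try {
      proxIn = new FileInputStream(prx); // may throw
      success = true;
    } finally {
      if (!success) {
        freqIn.close();                  // don't leak the first handle
      }
    }
  }
}

The same shape recurs in the codec constructors seen earlier in this patch; it predates try-with-resources, which the Java 6-era codebase could not use.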
* @lucene.experimental */ -public abstract class StandardPostingsReader implements Closeable { +public class StandardPostingsReader extends AbstractPostingsReader { - public abstract void init(IndexInput termsIn) throws IOException; + private final IndexInput freqIn; + private final IndexInput proxIn; - /** Return a newly created empty TermState */ - public abstract TermState newTermState() throws IOException; + int skipInterval; + int maxSkipLevels; - public abstract void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState state, boolean isIndexTerm) throws IOException; + public StandardPostingsReader(Directory dir, SegmentInfo segmentInfo, int readBufferSize) throws IOException { + freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, "", StandardCodec.FREQ_EXTENSION), + readBufferSize); + if (segmentInfo.getHasProx()) { + boolean success = false; + try { + proxIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, "", StandardCodec.PROX_EXTENSION), + readBufferSize); + success = true; + } finally { + if (!success) { + freqIn.close(); + } + } + } else { + proxIn = null; + } + } - /** Must fully consume state, since after this call that - * TermState may be reused. */ - public abstract DocsEnum docs(FieldInfo fieldInfo, TermState state, Bits skipDocs, DocsEnum reuse) throws IOException; + public static void files(Directory dir, SegmentInfo segmentInfo, Collection files) throws IOException { + files.add(IndexFileNames.segmentFileName(segmentInfo.name, "", StandardCodec.FREQ_EXTENSION)); + if (segmentInfo.getHasProx()) { + files.add(IndexFileNames.segmentFileName(segmentInfo.name, "", StandardCodec.PROX_EXTENSION)); + } + } - /** Must fully consume state, since after this call that - * TermState may be reused. */ - public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState state, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException; + @Override + public void init(IndexInput termsIn) throws IOException { - public abstract void close() throws IOException; + // Make sure we are talking to the matching past writer + CodecUtil.checkHeader(termsIn, StandardPostingsWriter.CODEC, + StandardPostingsWriter.VERSION_START, StandardPostingsWriter.VERSION_START); + + skipInterval = termsIn.readInt(); + maxSkipLevels = termsIn.readInt(); + } + + private static class DocTermState extends TermState { + long freqOffset; + long proxOffset; + int skipOffset; + + public Object clone() { + DocTermState other = (DocTermState) super.clone(); + other.freqOffset = freqOffset; + other.proxOffset = proxOffset; + other.skipOffset = skipOffset; + return other; + } + + public void copy(TermState _other) { + super.copy(_other); + DocTermState other = (DocTermState) _other; + freqOffset = other.freqOffset; + proxOffset = other.proxOffset; + skipOffset = other.skipOffset; + } + + public String toString() { + return super.toString() + " freqFP=" + freqOffset + " proxFP=" + proxOffset + " skipOffset=" + skipOffset; + } + } + + @Override + public TermState newTermState() { + return new DocTermState(); + } + + @Override + public void close() throws IOException { + try { + if (freqIn != null) { + freqIn.close(); + } + } finally { + if (proxIn != null) { + proxIn.close(); + } + } + } + + @Override + public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState termState, boolean isIndexTerm) + throws IOException { + + final DocTermState docTermState = (DocTermState) termState; + + if (isIndexTerm) { + docTermState.freqOffset = termsIn.readVLong(); 
+ } else { + docTermState.freqOffset += termsIn.readVLong(); + } + + if (docTermState.docFreq >= skipInterval) { + docTermState.skipOffset = termsIn.readVInt(); + } else { + docTermState.skipOffset = 0; + } + + if (!fieldInfo.omitTermFreqAndPositions) { + if (isIndexTerm) { + docTermState.proxOffset = termsIn.readVLong(); + } else { + docTermState.proxOffset += termsIn.readVLong(); + } + } + } + + @Override + public DocsEnum docs(FieldInfo fieldInfo, TermState termState, Bits skipDocs, DocsEnum reuse) throws IOException { + SegmentDocsEnum docsEnum; + if (reuse == null || !(reuse instanceof SegmentDocsEnum)) { + docsEnum = new SegmentDocsEnum(freqIn); + } else { + docsEnum = (SegmentDocsEnum) reuse; + if (docsEnum.startFreqIn != freqIn) { + // If you are using ParallelReader, and pass in a + // reused DocsEnum, it could have come from another + // reader also using standard codec + docsEnum = new SegmentDocsEnum(freqIn); + } + } + return docsEnum.reset(fieldInfo, (DocTermState) termState, skipDocs); + } + + @Override + public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException { + if (fieldInfo.omitTermFreqAndPositions) { + return null; + } + SegmentDocsAndPositionsEnum docsEnum; + if (reuse == null || !(reuse instanceof SegmentDocsAndPositionsEnum)) { + docsEnum = new SegmentDocsAndPositionsEnum(freqIn, proxIn); + } else { + docsEnum = (SegmentDocsAndPositionsEnum) reuse; + if (docsEnum.startFreqIn != freqIn) { + // If you are using ParallelReader, and pass in a + // reused DocsEnum, it could have come from another + // reader also using standard codec + docsEnum = new SegmentDocsAndPositionsEnum(freqIn, proxIn); + } + } + return docsEnum.reset(fieldInfo, (DocTermState) termState, skipDocs); + } + + // Decodes only docs + private class SegmentDocsEnum extends DocsEnum { + final IndexInput freqIn; + final IndexInput startFreqIn; + + boolean omitTF; // does current field omit term freq? + boolean storePayloads; // does current field store payloads?
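readTerm above compresses the per-term metadata: a term at an index point stores freqOffset/proxOffset as absolute VLongs, while every other term stores only the delta from its predecessor, accumulated into the reusable TermState. A toy decoder for that convention (DataInputStream and readLong stand in for IndexInput and readVLong):

import java.io.*;

public class PointerDecode {
  // isIndexTerm[i] == true means pointers[i] was written absolute;
  // otherwise it was written as a delta from pointers[i-1].
  static long[] decode(DataInputStream in, boolean[] isIndexTerm) throws IOException {
    final long[] pointers = new long[isIndexTerm.length];
    long current = 0;
    for (int i = 0; i < pointers.length; i++) {
      final long v = in.readLong();
      current = isIndexTerm[i] ? v : current + v;
      pointers[i] = current;
    }
    return pointers;
  }
}

Deltas between adjacent terms' file pointers are small, so as variable-length integers they cost a byte or two instead of eight.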
+ + int limit; // number of docs in this posting + int ord; // how many docs we've read + int doc; // doc we last read + int freq; // freq we last read + + Bits skipDocs; + + long freqOffset; + int skipOffset; + + boolean skipped; + DefaultSkipListReader skipper; + + public SegmentDocsEnum(IndexInput freqIn) throws IOException { + startFreqIn = freqIn; + this.freqIn = (IndexInput) freqIn.clone(); + } + + public SegmentDocsEnum reset(FieldInfo fieldInfo, DocTermState termState, Bits skipDocs) throws IOException { + omitTF = fieldInfo.omitTermFreqAndPositions; + if (omitTF) { + freq = 1; + } + storePayloads = fieldInfo.storePayloads; + this.skipDocs = skipDocs; + freqOffset = termState.freqOffset; + skipOffset = termState.skipOffset; + + // TODO: for full enum case (eg segment merging) this + // seek is unnecessary; maybe we can avoid in such + // cases + freqIn.seek(termState.freqOffset); + limit = termState.docFreq; + ord = 0; + doc = 0; + + skipped = false; + + return this; + } + + @Override + public int nextDoc() throws IOException { + while(true) { + if (ord == limit) { + return doc = NO_MORE_DOCS; + } + + ord++; + + // Decode next doc/freq pair + final int code = freqIn.readVInt(); + if (omitTF) { + doc += code; + } else { + doc += code >>> 1; // shift off low bit + if ((code & 1) != 0) { // if low bit is set + freq = 1; // freq is one + } else { + freq = freqIn.readVInt(); // else read freq + } + } + + if (skipDocs == null || !skipDocs.get(doc)) { + break; + } + } + + return doc; + } + + @Override + public int read() throws IOException { + + final int[] docs = bulkResult.docs.ints; + final int[] freqs = bulkResult.freqs.ints; + int i = 0; + final int length = docs.length; + while (i < length && ord < limit) { + ord++; + // manually inlined call to next() for speed + final int code = freqIn.readVInt(); + if (omitTF) { + doc += code; + } else { + doc += code >>> 1; // shift off low bit + if ((code & 1) != 0) { // if low bit is set + freq = 1; // freq is one + } else { + freq = freqIn.readVInt(); // else read freq + } + } + + if (skipDocs == null || !skipDocs.get(doc)) { + docs[i] = doc; + freqs[i] = freq; + ++i; + } + } + + return i; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int freq() { + return freq; + } + + @Override + public int advance(int target) throws IOException { + + // TODO: jump right to next() if target is < X away + // from where we are now? 
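The decode loops above (nextDoc and the bulk read) rely on the postings wire format: each entry is a doc delta shifted left one bit, with the low bit set when freq == 1, so the common case needs no second VInt. An encode/decode pair for just that scheme, with fixed-width ints standing in for VInts:

import java.io.*;

public class DocFreqCode {
  static void encode(DataOutputStream out, int docDelta, int freq) throws IOException {
    if (freq == 1) {
      out.writeInt((docDelta << 1) | 1); // low bit set: freq implicitly 1
    } else {
      out.writeInt(docDelta << 1);       // low bit clear: freq follows
      out.writeInt(freq);
    }
  }

  static int[] decode(DataInputStream in) throws IOException {
    final int code = in.readInt();
    final int docDelta = code >>> 1;     // shift off the low bit
    final int freq = (code & 1) != 0 ? 1 : in.readInt();
    return new int[] { docDelta, freq };
  }
}

When omitTF is on, the low-bit trick is dropped and the code is the raw doc delta, exactly as the omitTF branch above reads it.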
+ + if (skipOffset > 0) { + + // There are enough docs in the posting to have + // skip data + + if (skipper == null) { + // This is the first time this enum has ever been used for skipping -- do lazy init + skipper = new DefaultSkipListReader((IndexInput) freqIn.clone(), maxSkipLevels, skipInterval); + } + + if (!skipped) { + + // This is the first time this posting has + // skipped since reset() was called, so now we + // load the skip data for this posting + + skipper.init(freqOffset + skipOffset, + freqOffset, 0, + limit, storePayloads); + + skipped = true; + } + + final int newOrd = skipper.skipTo(target); + + if (newOrd > ord) { + // Skipper moved + + ord = newOrd; + doc = skipper.getDoc(); + freqIn.seek(skipper.getFreqPointer()); + } + } + + // scan for the rest: + do { + nextDoc(); + } while (target > doc); + + return doc; + } + } + + // Decodes docs & positions + private class SegmentDocsAndPositionsEnum extends DocsAndPositionsEnum { + final IndexInput startFreqIn; + private final IndexInput freqIn; + private final IndexInput proxIn; + + boolean storePayloads; // does current field store payloads? + + int limit; // number of docs in this posting + int ord; // how many docs we've read + int doc; // doc we last read + int freq; // freq we last read + int position; + + Bits skipDocs; + + long freqOffset; + int skipOffset; + long proxOffset; + + int posPendingCount; + int payloadLength; + boolean payloadPending; + + boolean skipped; + DefaultSkipListReader skipper; + private BytesRef payload; + private long lazyProxPointer; + + public SegmentDocsAndPositionsEnum(IndexInput freqIn, IndexInput proxIn) throws IOException { + startFreqIn = freqIn; + this.freqIn = (IndexInput) freqIn.clone(); + this.proxIn = (IndexInput) proxIn.clone(); + } + + public SegmentDocsAndPositionsEnum reset(FieldInfo fieldInfo, DocTermState termState, Bits skipDocs) throws IOException { + assert !fieldInfo.omitTermFreqAndPositions; + storePayloads = fieldInfo.storePayloads; + if (storePayloads && payload == null) { + payload = new BytesRef(); + payload.bytes = new byte[1]; + } + + this.skipDocs = skipDocs; + + // TODO: for full enum case (eg segment merging) this + // seek is unnecessary; maybe we can avoid in such + // cases + freqIn.seek(termState.freqOffset); + lazyProxPointer = termState.proxOffset; + + limit = termState.docFreq; + ord = 0; + doc = 0; + position = 0; + + skipped = false; + posPendingCount = 0; + payloadPending = false; + + freqOffset = termState.freqOffset; + proxOffset = termState.proxOffset; + skipOffset = termState.skipOffset; + + return this; + } + + @Override + public int nextDoc() throws IOException { + while(true) { + if (ord == limit) { + return doc = NO_MORE_DOCS; + } + + ord++; + + // Decode next doc/freq pair + final int code = freqIn.readVInt(); + + doc += code >>> 1; // shift off low bit + if ((code & 1) != 0) { // if low bit is set + freq = 1; // freq is one + } else { + freq = freqIn.readVInt(); // else read freq + } + posPendingCount += freq; + + if (skipDocs == null || !skipDocs.get(doc)) { + break; + } + } + + position = 0; + + return doc; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int freq() { + return freq; + } + + @Override + public int advance(int target) throws IOException { + + // TODO: jump right to next() if target is < X away + // from where we are now? 
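Both advance() implementations in this file follow the same two-phase plan: if the posting carries skip data (skipOffset > 0), lazily build a DefaultSkipListReader, let it jump near the target, then finish with a linear nextDoc() scan. The control flow reduced to arrays, with one coarse skip level standing in for the multi-level reader:

public class SkipThenScan {
  static final int NO_MORE_DOCS = Integer.MAX_VALUE;

  // docs: sorted docIDs; skipDocs[i] is the docID at ord skipOrds[i],
  // sampled every N docs like a single skip-list level.
  static int advance(int[] docs, int[] skipDocs, int[] skipOrds, int target) {
    int ord = 0;
    for (int i = 0; i < skipDocs.length && skipDocs[i] < target; i++) {
      ord = skipOrds[i];                 // "skipper moved": resume further in
    }
    while (ord < docs.length && docs[ord] < target) {
      ord++;                             // linear scan for the rest
    }
    return ord == docs.length ? NO_MORE_DOCS : docs[ord];
  }
}

The positions variant does the same but must also resynchronize lazyProxPointer, posPendingCount and the payload state after a jump, which is why its skipper branch resets those fields.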
+ + if (skipOffset > 0) { + + // There are enough docs in the posting to have + // skip data + + if (skipper == null) { + // This is the first time this enum has ever been used for skipping -- do lazy init + skipper = new DefaultSkipListReader((IndexInput) freqIn.clone(), maxSkipLevels, skipInterval); + } + + if (!skipped) { + + // This is the first time this posting has + // skipped, since reset() was called, so now we + // load the skip data for this posting + + skipper.init(freqOffset+skipOffset, + freqOffset, proxOffset, + limit, storePayloads); + + skipped = true; + } + + final int newOrd = skipper.skipTo(target); + + if (newOrd > ord) { + // Skipper moved + ord = newOrd; + doc = skipper.getDoc(); + freqIn.seek(skipper.getFreqPointer()); + lazyProxPointer = skipper.getProxPointer(); + posPendingCount = 0; + position = 0; + payloadPending = false; + payloadLength = skipper.getPayloadLength(); + } + } + + // Now, linear scan for the rest: + do { + nextDoc(); + } while (target > doc); + + return doc; + } + + public int nextPosition() throws IOException { + + if (lazyProxPointer != -1) { + proxIn.seek(lazyProxPointer); + lazyProxPointer = -1; + } + + if (payloadPending && payloadLength > 0) { + // payload of last position as never retrieved -- skip it + proxIn.seek(proxIn.getFilePointer() + payloadLength); + payloadPending = false; + } + + // scan over any docs that were iterated without their positions + while(posPendingCount > freq) { + + final int code = proxIn.readVInt(); + + if (storePayloads) { + if ((code & 1) != 0) { + // new payload length + payloadLength = proxIn.readVInt(); + assert payloadLength >= 0; + } + assert payloadLength != -1; + proxIn.seek(proxIn.getFilePointer() + payloadLength); + } + + posPendingCount--; + position = 0; + payloadPending = false; + } + + // read next position + if (storePayloads) { + + if (payloadPending && payloadLength > 0) { + // payload wasn't retrieved for last position + proxIn.seek(proxIn.getFilePointer()+payloadLength); + } + + final int code = proxIn.readVInt(); + if ((code & 1) != 0) { + // new payload length + payloadLength = proxIn.readVInt(); + assert payloadLength >= 0; + } + assert payloadLength != -1; + + payloadPending = true; + position += code >>> 1; + } else { + position += proxIn.readVInt(); + } + + posPendingCount--; + + assert posPendingCount >= 0: "nextPosition() was called too many times (more than freq() times) posPendingCount=" + posPendingCount; + + return position; + } + + /** Returns length of payload at current position */ + public int getPayloadLength() { + assert lazyProxPointer == -1; + assert posPendingCount < freq; + return payloadLength; + } + + /** Returns the payload at this position, or null if no + * payload was indexed. 
*/ + public BytesRef getPayload() throws IOException { + assert lazyProxPointer == -1; + assert posPendingCount < freq; + if (!payloadPending) { + throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once."); + } + if (payloadLength > payload.bytes.length) { + payload.grow(payloadLength); + } + proxIn.readBytes(payload.bytes, 0, payloadLength); + payload.length = payloadLength; + payloadPending = false; + + return payload; + } + + public boolean hasPayload() { + return payloadPending && payloadLength > 0; + } + } } diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReaderImpl.java --- a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReaderImpl.java Wed Sep 15 18:41:40 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,611 +0,0 @@ -package org.apache.lucene.index.codecs.standard; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.util.Collection; - -import org.apache.lucene.store.Directory; -import org.apache.lucene.index.SegmentInfo; -import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.DocsEnum; -import org.apache.lucene.index.DocsAndPositionsEnum; -import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.Bits; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.CodecUtil; - -/** Concrete class that reads the current doc/freq/skip - * postings format. 
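getPayload() above is deliberately one-shot: payload bytes are only pulled from the prox stream on demand, and payloadPending is cleared so a second call throws instead of rereading bytes the file pointer has already consumed. The state machine in miniature (field names mirror the patch; the stream is a stand-in):

import java.io.*;

public class OneShotPayload {
  private final DataInputStream proxIn;
  private int payloadLength;
  private boolean payloadPending;

  OneShotPayload(DataInputStream proxIn) { this.proxIn = proxIn; }

  void onPosition(int length) {          // what nextPosition() records
    payloadLength = length;
    payloadPending = true;
  }

  byte[] getPayload() throws IOException {
    if (!payloadPending) {
      throw new IOException("no payload at this position, or already loaded");
    }
    final byte[] payload = new byte[payloadLength];
    proxIn.readFully(payload);           // consumes the stream: not repeatable
    payloadPending = false;
    return payload;
  }
}

This is also why nextPosition() must skip an unread payload before decoding the next delta: the payload bytes sit inline in the prox stream between positions.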
- * @lucene.experimental */ - -public class StandardPostingsReaderImpl extends StandardPostingsReader { - - private final IndexInput freqIn; - private final IndexInput proxIn; - - int skipInterval; - int maxSkipLevels; - - public StandardPostingsReaderImpl(Directory dir, SegmentInfo segmentInfo, int readBufferSize) throws IOException { - freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, "", StandardCodec.FREQ_EXTENSION), - readBufferSize); - if (segmentInfo.getHasProx()) { - boolean success = false; - try { - proxIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, "", StandardCodec.PROX_EXTENSION), - readBufferSize); - success = true; - } finally { - if (!success) { - freqIn.close(); - } - } - } else { - proxIn = null; - } - } - - public static void files(Directory dir, SegmentInfo segmentInfo, Collection files) throws IOException { - files.add(IndexFileNames.segmentFileName(segmentInfo.name, "", StandardCodec.FREQ_EXTENSION)); - if (segmentInfo.getHasProx()) { - files.add(IndexFileNames.segmentFileName(segmentInfo.name, "", StandardCodec.PROX_EXTENSION)); - } - } - - @Override - public void init(IndexInput termsIn) throws IOException { - - // Make sure we are talking to the matching past writer - CodecUtil.checkHeader(termsIn, StandardPostingsWriterImpl.CODEC, - StandardPostingsWriterImpl.VERSION_START, StandardPostingsWriterImpl.VERSION_START); - - skipInterval = termsIn.readInt(); - maxSkipLevels = termsIn.readInt(); - } - - private static class DocTermState extends TermState { - long freqOffset; - long proxOffset; - int skipOffset; - - public Object clone() { - DocTermState other = (DocTermState) super.clone(); - other.freqOffset = freqOffset; - other.proxOffset = proxOffset; - other.skipOffset = skipOffset; - return other; - } - - public void copy(TermState _other) { - super.copy(_other); - DocTermState other = (DocTermState) _other; - freqOffset = other.freqOffset; - proxOffset = other.proxOffset; - skipOffset = other.skipOffset; - } - - public String toString() { - return super.toString() + " freqFP=" + freqOffset + " proxFP=" + proxOffset + " skipOffset=" + skipOffset; - } - } - - @Override - public TermState newTermState() { - return new DocTermState(); - } - - @Override - public void close() throws IOException { - try { - if (freqIn != null) { - freqIn.close(); - } - } finally { - if (proxIn != null) { - proxIn.close(); - } - } - } - - @Override - public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState termState, boolean isIndexTerm) - throws IOException { - - final DocTermState docTermState = (DocTermState) termState; - - if (isIndexTerm) { - docTermState.freqOffset = termsIn.readVLong(); - } else { - docTermState.freqOffset += termsIn.readVLong(); - } - - if (docTermState.docFreq >= skipInterval) { - docTermState.skipOffset = termsIn.readVInt(); - } else { - docTermState.skipOffset = 0; - } - - if (!fieldInfo.omitTermFreqAndPositions) { - if (isIndexTerm) { - docTermState.proxOffset = termsIn.readVLong(); - } else { - docTermState.proxOffset += termsIn.readVLong(); - } - } - } - - @Override - public DocsEnum docs(FieldInfo fieldInfo, TermState termState, Bits skipDocs, DocsEnum reuse) throws IOException { - SegmentDocsEnum docsEnum; - if (reuse == null || !(reuse instanceof SegmentDocsEnum)) { - docsEnum = new SegmentDocsEnum(freqIn); - } else { - docsEnum = (SegmentDocsEnum) reuse; - if (docsEnum.startFreqIn != freqIn) { - // If you are using ParellelReader, and pass in a - // reused DocsEnum, it could have come from 
another - // reader also using standard codec - docsEnum = new SegmentDocsEnum(freqIn); - } - } - return docsEnum.reset(fieldInfo, (DocTermState) termState, skipDocs); - } - - @Override - public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException { - if (fieldInfo.omitTermFreqAndPositions) { - return null; - } - SegmentDocsAndPositionsEnum docsEnum; - if (reuse == null || !(reuse instanceof SegmentDocsAndPositionsEnum)) { - docsEnum = new SegmentDocsAndPositionsEnum(freqIn, proxIn); - } else { - docsEnum = (SegmentDocsAndPositionsEnum) reuse; - if (docsEnum.startFreqIn != freqIn) { - // If you are using ParellelReader, and pass in a - // reused DocsEnum, it could have come from another - // reader also using standard codec - docsEnum = new SegmentDocsAndPositionsEnum(freqIn, proxIn); - } - } - return docsEnum.reset(fieldInfo, (DocTermState) termState, skipDocs); - } - - // Decodes only docs - private class SegmentDocsEnum extends DocsEnum { - final IndexInput freqIn; - final IndexInput startFreqIn; - - boolean omitTF; // does current field omit term freq? - boolean storePayloads; // does current field store payloads? - - int limit; // number of docs in this posting - int ord; // how many docs we've read - int doc; // doc we last read - int freq; // freq we last read - - Bits skipDocs; - - long freqOffset; - int skipOffset; - - boolean skipped; - DefaultSkipListReader skipper; - - public SegmentDocsEnum(IndexInput freqIn) throws IOException { - startFreqIn = freqIn; - this.freqIn = (IndexInput) freqIn.clone(); - } - - public SegmentDocsEnum reset(FieldInfo fieldInfo, DocTermState termState, Bits skipDocs) throws IOException { - omitTF = fieldInfo.omitTermFreqAndPositions; - if (omitTF) { - freq = 1; - } - storePayloads = fieldInfo.storePayloads; - this.skipDocs = skipDocs; - freqOffset = termState.freqOffset; - skipOffset = termState.skipOffset; - - // TODO: for full enum case (eg segment merging) this - // seek is unnecessary; maybe we can avoid in such - // cases - freqIn.seek(termState.freqOffset); - limit = termState.docFreq; - ord = 0; - doc = 0; - - skipped = false; - - return this; - } - - @Override - public int nextDoc() throws IOException { - while(true) { - if (ord == limit) { - return doc = NO_MORE_DOCS; - } - - ord++; - - // Decode next doc/freq pair - final int code = freqIn.readVInt(); - if (omitTF) { - doc += code; - } else { - doc += code >>> 1; // shift off low bit - if ((code & 1) != 0) { // if low bit is set - freq = 1; // freq is one - } else { - freq = freqIn.readVInt(); // else read freq - } - } - - if (skipDocs == null || !skipDocs.get(doc)) { - break; - } - } - - return doc; - } - - @Override - public int read() throws IOException { - - final int[] docs = bulkResult.docs.ints; - final int[] freqs = bulkResult.freqs.ints; - int i = 0; - final int length = docs.length; - while (i < length && ord < limit) { - ord++; - // manually inlined call to next() for speed - final int code = freqIn.readVInt(); - if (omitTF) { - doc += code; - } else { - doc += code >>> 1; // shift off low bit - if ((code & 1) != 0) { // if low bit is set - freq = 1; // freq is one - } else { - freq = freqIn.readVInt(); // else read freq - } - } - - if (skipDocs == null || !skipDocs.get(doc)) { - docs[i] = doc; - freqs[i] = freq; - ++i; - } - } - - return i; - } - - @Override - public int docID() { - return doc; - } - - @Override - public int freq() { - return freq; - } - - @Override - public int advance(int 
target) throws IOException { - - // TODO: jump right to next() if target is < X away - // from where we are now? - - if (skipOffset > 0) { - - // There are enough docs in the posting to have - // skip data - - if (skipper == null) { - // This is the first time this enum has ever been used for skipping -- do lazy init - skipper = new DefaultSkipListReader((IndexInput) freqIn.clone(), maxSkipLevels, skipInterval); - } - - if (!skipped) { - - // This is the first time this posting has - // skipped since reset() was called, so now we - // load the skip data for this posting - - skipper.init(freqOffset + skipOffset, - freqOffset, 0, - limit, storePayloads); - - skipped = true; - } - - final int newOrd = skipper.skipTo(target); - - if (newOrd > ord) { - // Skipper moved - - ord = newOrd; - doc = skipper.getDoc(); - freqIn.seek(skipper.getFreqPointer()); - } - } - - // scan for the rest: - do { - nextDoc(); - } while (target > doc); - - return doc; - } - } - - // Decodes docs & positions - private class SegmentDocsAndPositionsEnum extends DocsAndPositionsEnum { - final IndexInput startFreqIn; - private final IndexInput freqIn; - private final IndexInput proxIn; - - boolean storePayloads; // does current field store payloads? - - int limit; // number of docs in this posting - int ord; // how many docs we've read - int doc; // doc we last read - int freq; // freq we last read - int position; - - Bits skipDocs; - - long freqOffset; - int skipOffset; - long proxOffset; - - int posPendingCount; - int payloadLength; - boolean payloadPending; - - boolean skipped; - DefaultSkipListReader skipper; - private BytesRef payload; - private long lazyProxPointer; - - public SegmentDocsAndPositionsEnum(IndexInput freqIn, IndexInput proxIn) throws IOException { - startFreqIn = freqIn; - this.freqIn = (IndexInput) freqIn.clone(); - this.proxIn = (IndexInput) proxIn.clone(); - } - - public SegmentDocsAndPositionsEnum reset(FieldInfo fieldInfo, DocTermState termState, Bits skipDocs) throws IOException { - assert !fieldInfo.omitTermFreqAndPositions; - storePayloads = fieldInfo.storePayloads; - if (storePayloads && payload == null) { - payload = new BytesRef(); - payload.bytes = new byte[1]; - } - - this.skipDocs = skipDocs; - - // TODO: for full enum case (eg segment merging) this - // seek is unnecessary; maybe we can avoid in such - // cases - freqIn.seek(termState.freqOffset); - lazyProxPointer = termState.proxOffset; - - limit = termState.docFreq; - ord = 0; - doc = 0; - position = 0; - - skipped = false; - posPendingCount = 0; - payloadPending = false; - - freqOffset = termState.freqOffset; - proxOffset = termState.proxOffset; - skipOffset = termState.skipOffset; - - return this; - } - - @Override - public int nextDoc() throws IOException { - while(true) { - if (ord == limit) { - return doc = NO_MORE_DOCS; - } - - ord++; - - // Decode next doc/freq pair - final int code = freqIn.readVInt(); - - doc += code >>> 1; // shift off low bit - if ((code & 1) != 0) { // if low bit is set - freq = 1; // freq is one - } else { - freq = freqIn.readVInt(); // else read freq - } - posPendingCount += freq; - - if (skipDocs == null || !skipDocs.get(doc)) { - break; - } - } - - position = 0; - - return doc; - } - - @Override - public int docID() { - return doc; - } - - @Override - public int freq() { - return freq; - } - - @Override - public int advance(int target) throws IOException { - - // TODO: jump right to next() if target is < X away - // from where we are now? 
- - if (skipOffset > 0) { - - // There are enough docs in the posting to have - // skip data - - if (skipper == null) { - // This is the first time this enum has ever been used for skipping -- do lazy init - skipper = new DefaultSkipListReader((IndexInput) freqIn.clone(), maxSkipLevels, skipInterval); - } - - if (!skipped) { - - // This is the first time this posting has - // skipped since reset() was called, so now we - // load the skip data for this posting - - skipper.init(freqOffset+skipOffset, - freqOffset, proxOffset, - limit, storePayloads); - - skipped = true; - } - - final int newOrd = skipper.skipTo(target); - - if (newOrd > ord) { - // Skipper moved - ord = newOrd; - doc = skipper.getDoc(); - freqIn.seek(skipper.getFreqPointer()); - lazyProxPointer = skipper.getProxPointer(); - posPendingCount = 0; - position = 0; - payloadPending = false; - payloadLength = skipper.getPayloadLength(); - } - } - - // Now, linear scan for the rest: - do { - nextDoc(); - } while (target > doc); - - return doc; - } - - public int nextPosition() throws IOException { - - if (lazyProxPointer != -1) { - proxIn.seek(lazyProxPointer); - lazyProxPointer = -1; - } - - if (payloadPending && payloadLength > 0) { - // payload of last position was never retrieved -- skip it - proxIn.seek(proxIn.getFilePointer() + payloadLength); - payloadPending = false; - } - - // scan over any docs that were iterated without their positions - while(posPendingCount > freq) { - - final int code = proxIn.readVInt(); - - if (storePayloads) { - if ((code & 1) != 0) { - // new payload length - payloadLength = proxIn.readVInt(); - assert payloadLength >= 0; - } - assert payloadLength != -1; - proxIn.seek(proxIn.getFilePointer() + payloadLength); - } - - posPendingCount--; - position = 0; - payloadPending = false; - } - - // read next position - if (storePayloads) { - - if (payloadPending && payloadLength > 0) { - // payload wasn't retrieved for last position - proxIn.seek(proxIn.getFilePointer()+payloadLength); - } - - final int code = proxIn.readVInt(); - if ((code & 1) != 0) { - // new payload length - payloadLength = proxIn.readVInt(); - assert payloadLength >= 0; - } - assert payloadLength != -1; - - payloadPending = true; - position += code >>> 1; - } else { - position += proxIn.readVInt(); - } - - posPendingCount--; - - assert posPendingCount >= 0: "nextPosition() was called too many times (more than freq() times) posPendingCount=" + posPendingCount; - - return position; - } - - /** Returns length of payload at current position */ - public int getPayloadLength() { - assert lazyProxPointer == -1; - assert posPendingCount < freq; - return payloadLength; - } - - /** Returns the payload at this position, or null if no - * payload was indexed. 
*/ - public BytesRef getPayload() throws IOException { - assert lazyProxPointer == -1; - assert posPendingCount < freq; - if (!payloadPending) { - throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once."); - } - if (payloadLength > payload.bytes.length) { - payload.grow(payloadLength); - } - proxIn.readBytes(payload.bytes, 0, payloadLength); - payload.length = payloadLength; - payloadPending = false; - - return payload; - } - - public boolean hasPayload() { - return payloadPending && payloadLength > 0; - } - } -} diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java --- a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java Wed Sep 15 18:41:40 2010 +0000 +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java Thu Sep 16 05:41:06 2010 -0400 @@ -17,27 +17,219 @@ * limitations under the License. */ +/** Consumes doc & freq, writing them using the current + * index file format */ + import java.io.IOException; -import java.io.Closeable; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.codecs.PostingsConsumer; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.codecs.AbstractPostingsWriter; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; -/** - * @lucene.experimental - */ +/** @lucene.experimental */ +public final class StandardPostingsWriter extends AbstractPostingsWriter { + final static String CODEC = "StandardPostingsWriterImpl"; + + // Increment version to change it: + final static int VERSION_START = 0; + final static int VERSION_CURRENT = VERSION_START; -public abstract class StandardPostingsWriter extends PostingsConsumer implements Closeable { + final IndexOutput freqOut; + final IndexOutput proxOut; + final DefaultSkipListWriter skipListWriter; + final int skipInterval; + final int maxSkipLevels; + final int totalNumDocs; + IndexOutput termsOut; - public abstract void start(IndexOutput termsOut) throws IOException; + boolean omitTermFreqAndPositions; + boolean storePayloads; + // Starts a new term + long lastFreqStart; + long freqStart; + long lastProxStart; + long proxStart; + FieldInfo fieldInfo; + int lastPayloadLength; + int lastPosition; - public abstract void startTerm() throws IOException; + public StandardPostingsWriter(SegmentWriteState state) throws IOException { + super(); + String fileName = IndexFileNames.segmentFileName(state.segmentName, "", StandardCodec.FREQ_EXTENSION); + state.flushedFiles.add(fileName); + freqOut = state.directory.createOutput(fileName); - /** Finishes the current term */ - public abstract void finishTerm(int numDocs, boolean isIndexTerm) throws IOException; + if (state.fieldInfos.hasProx()) { + // At least one field does not omit TF, so create the + // prox file + fileName = IndexFileNames.segmentFileName(state.segmentName, "", StandardCodec.PROX_EXTENSION); + state.flushedFiles.add(fileName); + proxOut = state.directory.createOutput(fileName); + } else { + // Every field omits TF so we will write no prox file + proxOut = null; + } - public abstract void setField(FieldInfo fieldInfo); + totalNumDocs = state.numDocs; - public abstract void close() throws IOException; + skipListWriter = new DefaultSkipListWriter(state.skipInterval, + 
state.maxSkipLevels, + state.numDocs, + freqOut, + proxOut); + + skipInterval = state.skipInterval; + maxSkipLevels = state.maxSkipLevels; + } + + @Override + public void start(IndexOutput termsOut) throws IOException { + this.termsOut = termsOut; + CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT); + termsOut.writeInt(skipInterval); // write skipInterval + termsOut.writeInt(maxSkipLevels); // write maxSkipLevels + } + + @Override + public void startTerm() { + freqStart = freqOut.getFilePointer(); + if (proxOut != null) { + proxStart = proxOut.getFilePointer(); + // force first payload to write its length + lastPayloadLength = -1; + } + skipListWriter.resetSkip(); + } + + // Currently, this instance is re-used across fields, so + // our parent calls setField whenever the field changes + @Override + public void setField(FieldInfo fieldInfo) { + this.fieldInfo = fieldInfo; + omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions; + storePayloads = fieldInfo.storePayloads; + } + + int lastDocID; + int df; + + /** Adds a new doc in this term, writing its delta-coded + * doc ID and, unless the field omits them, its term freq + * to the freq stream. */ + @Override + public void startDoc(int docID, int termDocFreq) throws IOException { + + final int delta = docID - lastDocID; + + if (docID < 0 || (df > 0 && delta <= 0)) { + throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )"); + } + + if ((++df % skipInterval) == 0) { + skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength); + skipListWriter.bufferSkip(df); + } + + assert docID < totalNumDocs: "docID=" + docID + " totalNumDocs=" + totalNumDocs; + + lastDocID = docID; + if (omitTermFreqAndPositions) { + freqOut.writeVInt(delta); + } else if (1 == termDocFreq) { + freqOut.writeVInt((delta<<1) | 1); + } else { + freqOut.writeVInt(delta<<1); + freqOut.writeVInt(termDocFreq); + } + + lastPosition = 0; + } + + /** Add a new position & payload */ + @Override + public void addPosition(int position, BytesRef payload) throws IOException { + assert !omitTermFreqAndPositions: "omitTermFreqAndPositions is true"; + assert proxOut != null; + + final int delta = position - lastPosition; + + assert delta > 0 || position == 0 || position == -1: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it) + + lastPosition = position; + + if (storePayloads) { + final int payloadLength = payload == null ? 0 : payload.length; + + if (payloadLength != lastPayloadLength) { + lastPayloadLength = payloadLength; + proxOut.writeVInt((delta<<1)|1); + proxOut.writeVInt(payloadLength); + } else { + proxOut.writeVInt(delta << 1); + } + + if (payloadLength > 0) { + proxOut.writeBytes(payload.bytes, payload.offset, payloadLength); + } + } else { + proxOut.writeVInt(delta); + } + } + + @Override + public void finishDoc() { + } + + /** Called when we are done adding docs to this term */ + @Override + public void finishTerm(int docCount, boolean isIndexTerm) throws IOException { + assert docCount > 0; + + // TODO: wasteful we are counting this (counting # docs + // for this term) in two places? 
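+ // What follows records this term's pointers into the terms dict: + // freqStart is written absolute at index terms but delta-coded from + // the previous term otherwise; a skip pointer follows only if this + // term buffered skip data (df >= skipInterval); proxStart gets the + // same absolute/delta treatment when positions are kept. For + // illustration, assuming lastFreqStart=1000 and freqStart=1420: a + // non-index term writes vLong 420, while an index term writes vLong + // 1420, letting the terms index seek straight to it.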
+ assert docCount == df; + + if (isIndexTerm) { + // Write absolute at seek points + termsOut.writeVLong(freqStart); + } else { + // Write delta between seek points + termsOut.writeVLong(freqStart - lastFreqStart); + } + + lastFreqStart = freqStart; + + if (df >= skipInterval) { + termsOut.writeVInt((int) (skipListWriter.writeSkip(freqOut)-freqStart)); + } + + if (!omitTermFreqAndPositions) { + if (isIndexTerm) { + // Write absolute at seek points + termsOut.writeVLong(proxStart); + } else { + // Write delta between seek points + termsOut.writeVLong(proxStart - lastProxStart); + } + lastProxStart = proxStart; + } + + lastDocID = 0; + df = 0; + } + + @Override + public void close() throws IOException { + try { + freqOut.close(); + } finally { + if (proxOut != null) { + proxOut.close(); + } + } + } } diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriterImpl.java --- a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriterImpl.java Wed Sep 15 18:41:40 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,234 +0,0 @@ -package org.apache.lucene.index.codecs.standard; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** Consumes doc & freq, writing them using the current - * index file format */ - -import java.io.IOException; - -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.CorruptIndexException; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.CodecUtil; - -/** @lucene.experimental */ -public final class StandardPostingsWriterImpl extends StandardPostingsWriter { - final static String CODEC = "StandardPostingsWriterImpl"; - - // Increment version to change it: - final static int VERSION_START = 0; - final static int VERSION_CURRENT = VERSION_START; - - final IndexOutput freqOut; - final IndexOutput proxOut; - final DefaultSkipListWriter skipListWriter; - final int skipInterval; - final int maxSkipLevels; - final int totalNumDocs; - IndexOutput termsOut; - - boolean omitTermFreqAndPositions; - boolean storePayloads; - // Starts a new term - long lastFreqStart; - long freqStart; - long lastProxStart; - long proxStart; - FieldInfo fieldInfo; - int lastPayloadLength; - int lastPosition; - - public StandardPostingsWriterImpl(SegmentWriteState state) throws IOException { - super(); - String fileName = IndexFileNames.segmentFileName(state.segmentName, "", StandardCodec.FREQ_EXTENSION); - state.flushedFiles.add(fileName); - freqOut = state.directory.createOutput(fileName); - - if (state.fieldInfos.hasProx()) { - // At least one field does not omit TF, so create the - // prox file - fileName = IndexFileNames.segmentFileName(state.segmentName, "", StandardCodec.PROX_EXTENSION); - state.flushedFiles.add(fileName); - proxOut = state.directory.createOutput(fileName); - } else { - // Every field omits TF so we will write no prox file - proxOut = null; - } - - totalNumDocs = state.numDocs; - - skipListWriter = new DefaultSkipListWriter(state.skipInterval, - state.maxSkipLevels, - state.numDocs, - freqOut, - proxOut); - - skipInterval = state.skipInterval; - maxSkipLevels = state.maxSkipLevels; - } - - @Override - public void start(IndexOutput termsOut) throws IOException { - this.termsOut = termsOut; - CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT); - termsOut.writeInt(skipInterval); // write skipInterval - termsOut.writeInt(maxSkipLevels); // write maxSkipLevels - } - - @Override - public void startTerm() { - freqStart = freqOut.getFilePointer(); - if (proxOut != null) { - proxStart = proxOut.getFilePointer(); - // force first payload to write its length - lastPayloadLength = -1; - } - skipListWriter.resetSkip(); - } - - // Currently, this instance is re-used across fields, so - // our parent calls setField whenever the field changes - @Override - public void setField(FieldInfo fieldInfo) { - this.fieldInfo = fieldInfo; - omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions; - storePayloads = fieldInfo.storePayloads; - } - - int lastDocID; - int df; - - /** Adds a new doc in this term. If this returns null - * then we just skip consuming positions/payloads. 
*/ - @Override - public void startDoc(int docID, int termDocFreq) throws IOException { - - final int delta = docID - lastDocID; - - if (docID < 0 || (df > 0 && delta <= 0)) { - throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )"); - } - - if ((++df % skipInterval) == 0) { - skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength); - skipListWriter.bufferSkip(df); - } - - assert docID < totalNumDocs: "docID=" + docID + " totalNumDocs=" + totalNumDocs; - - lastDocID = docID; - if (omitTermFreqAndPositions) { - freqOut.writeVInt(delta); - } else if (1 == termDocFreq) { - freqOut.writeVInt((delta<<1) | 1); - } else { - freqOut.writeVInt(delta<<1); - freqOut.writeVInt(termDocFreq); - } - - lastPosition = 0; - } - - /** Add a new position & payload */ - @Override - public void addPosition(int position, BytesRef payload) throws IOException { - assert !omitTermFreqAndPositions: "omitTermFreqAndPositions is true"; - assert proxOut != null; - - final int delta = position - lastPosition; - - assert delta > 0 || position == 0 || position == -1: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it) - - lastPosition = position; - - if (storePayloads) { - final int payloadLength = payload == null ? 0 : payload.length; - - if (payloadLength != lastPayloadLength) { - lastPayloadLength = payloadLength; - proxOut.writeVInt((delta<<1)|1); - proxOut.writeVInt(payloadLength); - } else { - proxOut.writeVInt(delta << 1); - } - - if (payloadLength > 0) { - proxOut.writeBytes(payload.bytes, payload.offset, payloadLength); - } - } else { - proxOut.writeVInt(delta); - } - } - - @Override - public void finishDoc() { - } - - /** Called when we are done adding docs to this term */ - @Override - public void finishTerm(int docCount, boolean isIndexTerm) throws IOException { - assert docCount > 0; - - // TODO: wasteful we are counting this (counting # docs - // for this term) in two places? - assert docCount == df; - - if (isIndexTerm) { - // Write absolute at seek points - termsOut.writeVLong(freqStart); - } else { - // Write delta between seek points - termsOut.writeVLong(freqStart - lastFreqStart); - } - - lastFreqStart = freqStart; - - if (df >= skipInterval) { - termsOut.writeVInt((int) (skipListWriter.writeSkip(freqOut)-freqStart)); - } - - if (!omitTermFreqAndPositions) { - if (isIndexTerm) { - // Write absolute at seek points - termsOut.writeVLong(proxStart); - } else { - // Write delta between seek points - termsOut.writeVLong(proxStart - lastProxStart); - } - lastProxStart = proxStart; - } - - lastDocID = 0; - df = 0; - } - - @Override - public void close() throws IOException { - try { - freqOut.close(); - } finally { - if (proxOut != null) { - proxOut.close(); - } - } - } -} diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java --- a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java Wed Sep 15 18:41:40 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,498 +0,0 @@ -package org.apache.lucene.index.codecs.standard; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.io.Closeable; -import java.util.Collection; -import java.util.Iterator; -import java.util.TreeMap; -import java.util.Comparator; - -import org.apache.lucene.index.DocsEnum; -import org.apache.lucene.index.DocsAndPositionsEnum; -import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.FieldInfos; -import org.apache.lucene.index.FieldsEnum; -import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.SegmentInfo; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.index.codecs.FieldsProducer; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.Bits; -import org.apache.lucene.util.DoubleBarrelLRUCache; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.CodecUtil; - -/** Handles a terms dict, but decouples all details of - * doc/freqs/positions reading to an instance of {@link - * StandardPostingsReader}. This class is reusable for - * codecs that use a different format for - * docs/freqs/positions (though codecs are also free to - * make their own terms dict impl). - * - *

This class also interacts with an instance of {@link - * StandardTermsIndexReader}, to abstract away the specific - * implementation of the terms dict index. - * @lucene.experimental */ - -public class StandardTermsDictReader extends FieldsProducer { - // Open input to the main terms dict file (_X.tis) - private final IndexInput in; - - // Reads the terms dict entries, to gather state to - // produce DocsEnum on demand - private final StandardPostingsReader postingsReader; - - private final TreeMap fields = new TreeMap(); - - // Comparator that orders our terms - private final Comparator termComp; - - // Caches the most recently looked-up field + terms: - private final DoubleBarrelLRUCache termsCache; - - // Reads the terms index - private StandardTermsIndexReader indexReader; - - // keeps the dirStart offset - protected long dirOffset; - - // Used as key for the terms cache - private static class FieldAndTerm extends DoubleBarrelLRUCache.CloneableKey { - String field; - BytesRef term; - - public FieldAndTerm() { - } - - public FieldAndTerm(FieldAndTerm other) { - field = other.field; - term = new BytesRef(other.term); - } - - @Override - public boolean equals(Object _other) { - FieldAndTerm other = (FieldAndTerm) _other; - return other.field == field && term.bytesEquals(other.term); - } - - @Override - public Object clone() { - return new FieldAndTerm(this); - } - - @Override - public int hashCode() { - return field.hashCode() * 31 + term.hashCode(); - } - } - - public StandardTermsDictReader(StandardTermsIndexReader indexReader, Directory dir, FieldInfos fieldInfos, String segment, StandardPostingsReader postingsReader, int readBufferSize, - Comparator termComp, int termsCacheSize) - throws IOException { - - this.postingsReader = postingsReader; - termsCache = new DoubleBarrelLRUCache(termsCacheSize); - - this.termComp = termComp; - - in = dir.openInput(IndexFileNames.segmentFileName(segment, "", StandardCodec.TERMS_EXTENSION), - readBufferSize); - - boolean success = false; - try { - readHeader(in); - - // Have PostingsReader init itself - postingsReader.init(in); - - // Read per-field details - seekDir(in, dirOffset); - - final int numFields = in.readInt(); - - for(int i=0;i= 0; - final long termsStartPointer = in.readLong(); - final StandardTermsIndexReader.FieldReader fieldIndexReader; - final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); - fieldIndexReader = indexReader.getField(fieldInfo); - if (numTerms > 0) { - assert !fields.containsKey(fieldInfo.name); - fields.put(fieldInfo.name, new FieldReader(fieldIndexReader, fieldInfo, numTerms, termsStartPointer)); - } - } - success = true; - } finally { - if (!success) { - in.close(); - } - } - - this.indexReader = indexReader; - } - - protected void readHeader(IndexInput input) throws IOException { - CodecUtil.checkHeader(in, StandardTermsDictWriter.CODEC_NAME, - StandardTermsDictWriter.VERSION_START, StandardTermsDictWriter.VERSION_CURRENT); - dirOffset = in.readLong(); - } - - protected void seekDir(IndexInput input, long dirOffset) - throws IOException { - input.seek(dirOffset); - } - - @Override - public void loadTermsIndex(int indexDivisor) throws IOException { - indexReader.loadTermsIndex(indexDivisor); - } - - @Override - public void close() throws IOException { - try { - try { - if (indexReader != null) { - indexReader.close(); - } - } finally { - // null so if an app hangs on to us (ie, we are not - // GCable, despite being closed) we still free most - // ram - indexReader = null; - if (in != null) { - in.close(); - } - } 
- } finally { - try { - if (postingsReader != null) { - postingsReader.close(); - } - } finally { - for(FieldReader field : fields.values()) { - field.close(); - } - } - } - } - - public static void files(Directory dir, SegmentInfo segmentInfo, Collection files) { - files.add(IndexFileNames.segmentFileName(segmentInfo.name, "", StandardCodec.TERMS_EXTENSION)); - } - - public static void getExtensions(Collection extensions) { - extensions.add(StandardCodec.TERMS_EXTENSION); - } - - @Override - public FieldsEnum iterator() { - return new TermFieldsEnum(); - } - - @Override - public Terms terms(String field) throws IOException { - return fields.get(field); - } - - // Iterates through all fields - private class TermFieldsEnum extends FieldsEnum { - final Iterator it; - FieldReader current; - - TermFieldsEnum() { - it = fields.values().iterator(); - } - - @Override - public String next() { - if (it.hasNext()) { - current = it.next(); - return current.fieldInfo.name; - } else { - current = null; - return null; - } - } - - @Override - public TermsEnum terms() throws IOException { - return current.iterator(); - } - } - - private class FieldReader extends Terms implements Closeable { - final long numTerms; - final FieldInfo fieldInfo; - final long termsStartPointer; - final StandardTermsIndexReader.FieldReader fieldIndexReader; - - FieldReader(StandardTermsIndexReader.FieldReader fieldIndexReader, FieldInfo fieldInfo, long numTerms, long termsStartPointer) { - assert numTerms > 0; - this.fieldInfo = fieldInfo; - this.numTerms = numTerms; - this.termsStartPointer = termsStartPointer; - this.fieldIndexReader = fieldIndexReader; - } - - @Override - public Comparator getComparator() { - return termComp; - } - - @Override - public void close() { - super.close(); - } - - @Override - public TermsEnum iterator() throws IOException { - return new SegmentTermsEnum(); - } - - @Override - public long getUniqueTermCount() { - return numTerms; - } - - // Iterates through terms in this field - private class SegmentTermsEnum extends TermsEnum { - private final IndexInput in; - private final DeltaBytesReader bytesReader; - private final TermState state; - private boolean seekPending; - private final StandardTermsIndexReader.TermsIndexResult indexResult = new StandardTermsIndexReader.TermsIndexResult(); - private final FieldAndTerm fieldTerm = new FieldAndTerm(); - - SegmentTermsEnum() throws IOException { - in = (IndexInput) StandardTermsDictReader.this.in.clone(); - in.seek(termsStartPointer); - bytesReader = new DeltaBytesReader(in); - fieldTerm.field = fieldInfo.name; - state = postingsReader.newTermState(); - state.ord = -1; - } - - @Override - public Comparator getComparator() { - return termComp; - } - - /** Seeks until the first term that's >= the provided - * text; returns SeekStatus.FOUND if the exact term - * is found, SeekStatus.NOT_FOUND if a different term - * was found, SeekStatus.END if we hit EOF */ - @Override - public SeekStatus seek(BytesRef term, boolean useCache) throws IOException { - // Check cache - fieldTerm.term = term; - TermState cachedState; - if (useCache) { - cachedState = termsCache.get(fieldTerm); - if (cachedState != null) { - state.copy(cachedState); - seekPending = true; - bytesReader.term.copy(term); - return SeekStatus.FOUND; - } - } else { - cachedState = null; - } - - boolean doSeek = true; - - if (state.ord != -1) { - // we are positioned - - final int cmp = termComp.compare(bytesReader.term, term); - - if (cmp == 0) { - // already at the requested term - return 
SeekStatus.FOUND; - } - - if (cmp < 0 && - fieldIndexReader.nextIndexTerm(state.ord, indexResult) && - termComp.compare(indexResult.term, term) > 0) { - // Optimization: requested term is within the - // same index block we are now in; skip seeking - // (but do scanning): - doSeek = false; - } - } - - // Used only for assert: - final long startOrd; - - if (doSeek) { - - // Ask index to find biggest index term that's <= - // our text: - fieldIndexReader.getIndexOffset(term, indexResult); - - in.seek(indexResult.offset); - seekPending = false; - - // NOTE: the first next() after an index seek is - // wasteful, since it redundantly reads the same - // bytes into the buffer. We could avoid storing - // those bytes in the primary file, but then when - // scanning over an index term we'd have to - // special case it: - bytesReader.reset(indexResult.term); - - state.ord = indexResult.position-1; - assert state.ord >= -1: "ord=" + state.ord + " pos=" + indexResult.position; - - startOrd = indexResult.position; - } else { - startOrd = -1; - } - - // Now scan: - while(next() != null) { - final int cmp = termComp.compare(bytesReader.term, term); - if (cmp == 0) { - - if (doSeek && useCache) { - // Store in cache - FieldAndTerm entryKey = new FieldAndTerm(fieldTerm); - cachedState = (TermState) state.clone(); - // this is fp after current term - cachedState.filePointer = in.getFilePointer(); - termsCache.put(entryKey, cachedState); - } - - return SeekStatus.FOUND; - } else if (cmp > 0) { - return SeekStatus.NOT_FOUND; - } - // The purpose of the terms dict index is to seek - // the enum to the closest index term before the - // term we are looking for. So, we should never - // cross another index term (besides the first - // one) while we are scanning: - assert state.ord == startOrd || !fieldIndexReader.isIndexTerm(state.ord, state.docFreq, true): "state.ord=" + state.ord + " startOrd=" + startOrd + " ir.isIndexTerm=" + fieldIndexReader.isIndexTerm(state.ord, state.docFreq, true) + " state.docFreq=" + state.docFreq; - } - - return SeekStatus.END; - } - - @Override - public SeekStatus seek(long ord) throws IOException { - - // TODO: should we cache term lookup by ord as well...? - - if (ord >= numTerms) { - state.ord = numTerms-1; - return SeekStatus.END; - } - - fieldIndexReader.getIndexOffset(ord, indexResult); - in.seek(indexResult.offset); - seekPending = false; - - // NOTE: the first next() after an index seek is - // wasteful, since it redundantly reads the same - // bytes into the buffer - bytesReader.reset(indexResult.term); - - state.ord = indexResult.position-1; - assert state.ord >= -1: "ord=" + state.ord; - - // Now, scan: - int left = (int) (ord - state.ord); - while(left > 0) { - final BytesRef term = next(); - assert term != null; - left--; - } - - // always found - return SeekStatus.FOUND; - } - - @Override - public BytesRef term() { - return bytesReader.term; - } - - @Override - public long ord() { - return state.ord; - } - - @Override - public BytesRef next() throws IOException { - - if (seekPending) { - seekPending = false; - in.seek(state.filePointer); - } - - if (state.ord >= numTerms-1) { - return null; - } - - bytesReader.read(); - state.docFreq = in.readVInt(); - - // TODO: would be cleaner, but space-wasting, to - // simply record a bit into each index entry as to - // whether it's an index entry or not, rather than - // re-compute that information... 
or, possibly store - // a "how many terms until next index entry" in each - // index entry, but that'd require some tricky - // lookahead work when writing the index - postingsReader.readTerm(in, - fieldInfo, state, - fieldIndexReader.isIndexTerm(1+state.ord, state.docFreq, false)); - - state.ord++; - - return bytesReader.term; - } - - @Override - public int docFreq() { - return state.docFreq; - } - - @Override - public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { - DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse); - assert docsEnum != null; - return docsEnum; - } - - @Override - public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException { - if (fieldInfo.omitTermFreqAndPositions) { - return null; - } else { - return postingsReader.docsAndPositions(fieldInfo, state, skipDocs, reuse); - } - } - } - } -} diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java --- a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java Wed Sep 15 18:41:40 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,192 +0,0 @@ -package org.apache.lucene.index.codecs.standard; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Comparator; - -import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.FieldInfos; -import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.index.codecs.FieldsConsumer; -import org.apache.lucene.index.codecs.PostingsConsumer; -import org.apache.lucene.index.codecs.TermsConsumer; -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.util.CodecUtil; - -/** - * Writes terms dict and interacts with docs/positions - * consumers to write the postings files. - * - * The [new] terms dict format is field-centric: each field - * has its own section in the file. Fields are written in - * UTF16 string comparison order. Within each field, each - * term's text is written in UTF16 string comparison order. 
- * @lucene.experimental - */ - -public class StandardTermsDictWriter extends FieldsConsumer { - - final static String CODEC_NAME = "STANDARD_TERMS_DICT"; - - // Initial format - public static final int VERSION_START = 0; - - public static final int VERSION_CURRENT = VERSION_START; - - private final DeltaBytesWriter termWriter; - - protected final IndexOutput out; - final StandardPostingsWriter postingsWriter; - final FieldInfos fieldInfos; - FieldInfo currentField; - private final StandardTermsIndexWriter termsIndexWriter; - private final List fields = new ArrayList(); - private final Comparator termComp; - - public StandardTermsDictWriter( - StandardTermsIndexWriter termsIndexWriter, - SegmentWriteState state, - StandardPostingsWriter postingsWriter, - Comparator termComp) throws IOException - { - final String termsFileName = IndexFileNames.segmentFileName(state.segmentName, "", StandardCodec.TERMS_EXTENSION); - this.termsIndexWriter = termsIndexWriter; - this.termComp = termComp; - out = state.directory.createOutput(termsFileName); - termsIndexWriter.setTermsOutput(out); - state.flushedFiles.add(termsFileName); - - fieldInfos = state.fieldInfos; - writeHeader(out); - termWriter = new DeltaBytesWriter(out); - currentField = null; - this.postingsWriter = postingsWriter; - - postingsWriter.start(out); // have consumer write its format/header - } - - protected void writeHeader(IndexOutput out) throws IOException { - // Count indexed fields up front - CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT); - - out.writeLong(0); // leave space for end index pointer - } - - @Override - public TermsConsumer addField(FieldInfo field) { - assert currentField == null || currentField.name.compareTo(field.name) < 0; - currentField = field; - StandardTermsIndexWriter.FieldWriter fieldIndexWriter = termsIndexWriter.addField(field); - TermsConsumer terms = new TermsWriter(fieldIndexWriter, field, postingsWriter); - fields.add(terms); - return terms; - } - - @Override - public void close() throws IOException { - - try { - final int fieldCount = fields.size(); - - final long dirStart = out.getFilePointer(); - - out.writeInt(fieldCount); - for(int i=0;i getComparator() { - return termComp; - } - - @Override - public PostingsConsumer startTerm(BytesRef text) throws IOException { - postingsWriter.startTerm(); - return postingsWriter; - } - - @Override - public void finishTerm(BytesRef text, int numDocs) throws IOException { - - assert numDocs > 0; - - final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, numDocs); - - termWriter.write(text); - out.writeVInt(numDocs); - - postingsWriter.finishTerm(numDocs, isIndexTerm); - numTerms++; - } - - // Finishes all terms in this field - @Override - public void finish() throws IOException { - fieldIndexWriter.finish(); - } - } -} diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexReader.java --- a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexReader.java Wed Sep 15 18:41:40 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,76 +0,0 @@ -package org.apache.lucene.index.codecs.standard; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.util.BytesRef; - -import java.io.IOException; -import java.util.Collection; - - -// TODO -// - allow for non-regular index intervals? eg with a -// long string of rare terms, you don't need such -// frequent indexing - -/** - * TermsDictReader interacts with an instance of this class - * to manage its terms index. The writer must accept - * indexed terms (many pairs of CharSequence text + long - * fileOffset), and then this reader must be able to - * retrieve the nearest index term to a provided term - * text. - * @lucene.experimental */ - -public abstract class StandardTermsIndexReader { - - static class TermsIndexResult { - long position; - final BytesRef term = new BytesRef(); - long offset; - }; - - public abstract class FieldReader { - /** Returns position of "largest" index term that's <= - * text. Returned TermsIndexResult may be reused - * across calls. This resets internal state, and - * expects that you'll then scan the file and - * sequentially call isIndexTerm for each term - * encountered. */ - public abstract void getIndexOffset(BytesRef term, TermsIndexResult result) throws IOException; - - public abstract void getIndexOffset(long ord, TermsIndexResult result) throws IOException; - - /** Call this sequentially for each term encountered, - * after calling {@link #getIndexOffset}. */ - public abstract boolean isIndexTerm(long ord, int docFreq, boolean onlyLoaded) throws IOException; - - /** Finds the next index term, after the specified - * ord. Returns true if one exists. */ - public abstract boolean nextIndexTerm(long ord, TermsIndexResult result) throws IOException; - } - - public abstract FieldReader getField(FieldInfo fieldInfo); - - public abstract void loadTermsIndex(int indexDivisor) throws IOException; - - public abstract void close() throws IOException; - - public abstract void getExtensions(Collection extensions); -} \ No newline at end of file diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexWriter.java --- a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexWriter.java Wed Sep 15 18:41:40 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ -package org.apache.lucene.index.codecs.standard; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.util.BytesRef; -import java.io.IOException; - -/** @lucene.experimental */ -public abstract class StandardTermsIndexWriter { - - public abstract void setTermsOutput(IndexOutput out); - - public abstract class FieldWriter { - public abstract boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException; - public abstract void finish() throws IOException; - } - - public abstract FieldWriter addField(FieldInfo fieldInfo); - - public abstract void close() throws IOException; -} \ No newline at end of file diff -r b896c1f47a25 lucene/src/java/org/apache/lucene/index/codecs/standard/TermState.java --- a/lucene/src/java/org/apache/lucene/index/codecs/standard/TermState.java Wed Sep 15 18:41:40 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ -package org.apache.lucene.index.codecs.standard; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.index.DocsEnum; // for javadocs - -/** - * Holds all state required for {@link StandardPostingsReader} - * to produce a {@link DocsEnum} without re-seeking the - * terms dict. 
- * @lucene.experimental - */ - -public class TermState implements Cloneable { - public long ord; // ord for this term - public long filePointer; // fp into the terms dict primary file (_X.tis) - public int docFreq; // how many docs have this term - - public void copy(TermState other) { - ord = other.ord; - filePointer = other.filePointer; - docFreq = other.docFreq; - } - - @Override - public Object clone() { - try { - return super.clone(); - } catch (CloneNotSupportedException cnse) { - // should not happen - throw new RuntimeException(cnse); - } - } - - @Override - public String toString() { - return "tis.fp=" + filePointer + " docFreq=" + docFreq + " ord=" + ord; - } -} diff -r b896c1f47a25 lucene/src/test/org/apache/lucene/TestExternalCodecs.java --- a/lucene/src/test/org/apache/lucene/TestExternalCodecs.java Wed Sep 15 18:41:40 2010 +0000 +++ b/lucene/src/test/org/apache/lucene/TestExternalCodecs.java Thu Sep 16 05:41:06 2010 -0400 @@ -497,18 +497,18 @@ @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - StandardPostingsWriter docsWriter = new StandardPostingsWriterImpl(state); + AbstractPostingsWriter docsWriter = new StandardPostingsWriter(state); // Terms that have <= freqCutoff number of docs are // "pulsed" (inlined): final int freqCutoff = 1; - StandardPostingsWriter pulsingWriter = new PulsingPostingsWriterImpl(freqCutoff, docsWriter); + AbstractPostingsWriter pulsingWriter = new PulsingPostingsWriterImpl(freqCutoff, docsWriter); // Terms dict index - StandardTermsIndexWriter indexWriter; + AbstractTermsIndexWriter indexWriter; boolean success = false; try { - indexWriter = new SimpleStandardTermsIndexWriter(state); + indexWriter = new SimpleTermsIndexWriter(state); success = true; } finally { if (!success) { @@ -519,7 +519,7 @@ // Terms dict success = false; try { - FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, reverseUnicodeComparator); + FieldsConsumer ret = new PrefixCodedTermsWriter(indexWriter, state, pulsingWriter, reverseUnicodeComparator); success = true; return ret; } finally { @@ -536,19 +536,19 @@ @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - StandardPostingsReader docsReader = new StandardPostingsReaderImpl(state.dir, state.segmentInfo, state.readBufferSize); - StandardPostingsReader pulsingReader = new PulsingPostingsReaderImpl(docsReader); + AbstractPostingsReader docsReader = new StandardPostingsReader(state.dir, state.segmentInfo, state.readBufferSize); + AbstractPostingsReader pulsingReader = new PulsingPostingsReaderImpl(docsReader); // Terms dict index reader - StandardTermsIndexReader indexReader; + AbstractTermsIndexReader indexReader; boolean success = false; try { - indexReader = new SimpleStandardTermsIndexReader(state.dir, - state.fieldInfos, - state.segmentInfo.name, - state.termsIndexDivisor, - reverseUnicodeComparator); + indexReader = new SimpleTermsIndexReader(state.dir, + state.fieldInfos, + state.segmentInfo.name, + state.termsIndexDivisor, + reverseUnicodeComparator); success = true; } finally { if (!success) { @@ -559,14 +559,14 @@ // Terms dict reader success = false; try { - FieldsProducer ret = new StandardTermsDictReader(indexReader, - state.dir, - state.fieldInfos, - state.segmentInfo.name, - pulsingReader, - state.readBufferSize, - reverseUnicodeComparator, - StandardCodec.TERMS_CACHE_SIZE); + FieldsProducer ret = new PrefixCodedTermsReader(indexReader, + state.dir, + state.fieldInfos, + state.segmentInfo.name, 
+ pulsingReader, + state.readBufferSize, + reverseUnicodeComparator, + StandardCodec.TERMS_CACHE_SIZE); success = true; return ret; } finally { @@ -582,9 +582,9 @@ @Override public void files(Directory dir, SegmentInfo segmentInfo, Set files) throws IOException { - StandardPostingsReaderImpl.files(dir, segmentInfo, files); - StandardTermsDictReader.files(dir, segmentInfo, files); - SimpleStandardTermsIndexReader.files(dir, segmentInfo, files); + StandardPostingsReader.files(dir, segmentInfo, files); + PrefixCodedTermsReader.files(dir, segmentInfo, files); + SimpleTermsIndexReader.files(dir, segmentInfo, files); } @Override diff -r b896c1f47a25 lucene/src/test/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java --- a/lucene/src/test/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java Wed Sep 15 18:41:40 2010 +0000 +++ b/lucene/src/test/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java Thu Sep 16 05:41:06 2010 -0400 @@ -33,14 +33,14 @@ import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl; import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexInput; import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexOutput; -import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexReader; -import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexWriter; -import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; -import org.apache.lucene.index.codecs.standard.StandardPostingsReader; -import org.apache.lucene.index.codecs.standard.StandardTermsDictReader; -import org.apache.lucene.index.codecs.standard.StandardTermsDictWriter; -import org.apache.lucene.index.codecs.standard.StandardTermsIndexReader; -import org.apache.lucene.index.codecs.standard.StandardTermsIndexWriter; +import org.apache.lucene.index.codecs.SimpleTermsIndexReader; +import org.apache.lucene.index.codecs.SimpleTermsIndexWriter; +import org.apache.lucene.index.codecs.AbstractPostingsWriter; +import org.apache.lucene.index.codecs.AbstractPostingsReader; +import org.apache.lucene.index.codecs.PrefixCodedTermsReader; +import org.apache.lucene.index.codecs.PrefixCodedTermsWriter; +import org.apache.lucene.index.codecs.AbstractTermsIndexReader; +import org.apache.lucene.index.codecs.AbstractTermsIndexWriter; import org.apache.lucene.index.codecs.standard.StandardCodec; import org.apache.lucene.store.*; import org.apache.lucene.util.BytesRef; @@ -105,12 +105,12 @@ @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - StandardPostingsWriter postingsWriter = new SepPostingsWriterImpl(state, new MockIntFactory()); + AbstractPostingsWriter postingsWriter = new SepPostingsWriterImpl(state, new MockIntFactory()); boolean success = false; - StandardTermsIndexWriter indexWriter; + AbstractTermsIndexWriter indexWriter; try { - indexWriter = new SimpleStandardTermsIndexWriter(state); + indexWriter = new SimpleTermsIndexWriter(state); success = true; } finally { if (!success) { @@ -120,7 +120,7 @@ success = false; try { - FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); + FieldsConsumer ret = new PrefixCodedTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; return ret; } finally { @@ -136,19 +136,19 @@ @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - StandardPostingsReader postingsReader = new 
SepPostingsReaderImpl(state.dir, + AbstractPostingsReader postingsReader = new SepPostingsReaderImpl(state.dir, state.segmentInfo, state.readBufferSize, new MockIntFactory()); - StandardTermsIndexReader indexReader; + AbstractTermsIndexReader indexReader; boolean success = false; try { - indexReader = new SimpleStandardTermsIndexReader(state.dir, - state.fieldInfos, - state.segmentInfo.name, - state.termsIndexDivisor, - BytesRef.getUTF8SortedAsUnicodeComparator()); + indexReader = new SimpleTermsIndexReader(state.dir, + state.fieldInfos, + state.segmentInfo.name, + state.termsIndexDivisor, + BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; } finally { if (!success) { @@ -158,14 +158,14 @@ success = false; try { - FieldsProducer ret = new StandardTermsDictReader(indexReader, - state.dir, - state.fieldInfos, - state.segmentInfo.name, - postingsReader, - state.readBufferSize, - BytesRef.getUTF8SortedAsUnicodeComparator(), - StandardCodec.TERMS_CACHE_SIZE); + FieldsProducer ret = new PrefixCodedTermsReader(indexReader, + state.dir, + state.fieldInfos, + state.segmentInfo.name, + postingsReader, + state.readBufferSize, + BytesRef.getUTF8SortedAsUnicodeComparator(), + StandardCodec.TERMS_CACHE_SIZE); success = true; return ret; } finally { @@ -182,14 +182,14 @@ @Override public void files(Directory dir, SegmentInfo segmentInfo, Set files) { SepPostingsReaderImpl.files(segmentInfo, files); - StandardTermsDictReader.files(dir, segmentInfo, files); - SimpleStandardTermsIndexReader.files(dir, segmentInfo, files); + PrefixCodedTermsReader.files(dir, segmentInfo, files); + SimpleTermsIndexReader.files(dir, segmentInfo, files); } @Override public void getExtensions(Set extensions) { SepPostingsWriterImpl.getExtensions(extensions); - StandardTermsDictReader.getExtensions(extensions); - SimpleStandardTermsIndexReader.getIndexExtensions(extensions); + PrefixCodedTermsReader.getExtensions(extensions); + SimpleTermsIndexReader.getIndexExtensions(extensions); } } diff -r b896c1f47a25 lucene/src/test/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java --- a/lucene/src/test/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java Wed Sep 15 18:41:40 2010 +0000 +++ b/lucene/src/test/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java Thu Sep 16 05:41:06 2010 -0400 @@ -33,14 +33,14 @@ import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl; import org.apache.lucene.index.codecs.intblock.VariableIntBlockIndexInput; import org.apache.lucene.index.codecs.intblock.VariableIntBlockIndexOutput; -import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexReader; -import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexWriter; -import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; -import org.apache.lucene.index.codecs.standard.StandardPostingsReader; -import org.apache.lucene.index.codecs.standard.StandardTermsDictReader; -import org.apache.lucene.index.codecs.standard.StandardTermsDictWriter; -import org.apache.lucene.index.codecs.standard.StandardTermsIndexReader; -import org.apache.lucene.index.codecs.standard.StandardTermsIndexWriter; +import org.apache.lucene.index.codecs.SimpleTermsIndexReader; +import org.apache.lucene.index.codecs.SimpleTermsIndexWriter; +import org.apache.lucene.index.codecs.AbstractPostingsWriter; +import org.apache.lucene.index.codecs.AbstractPostingsReader; +import org.apache.lucene.index.codecs.PrefixCodedTermsReader; +import 
org.apache.lucene.index.codecs.PrefixCodedTermsWriter; +import org.apache.lucene.index.codecs.AbstractTermsIndexReader; +import org.apache.lucene.index.codecs.AbstractTermsIndexWriter; import org.apache.lucene.index.codecs.standard.StandardCodec; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; @@ -128,12 +128,12 @@ @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - StandardPostingsWriter postingsWriter = new SepPostingsWriterImpl(state, new MockIntFactory()); + AbstractPostingsWriter postingsWriter = new SepPostingsWriterImpl(state, new MockIntFactory()); boolean success = false; - StandardTermsIndexWriter indexWriter; + AbstractTermsIndexWriter indexWriter; try { - indexWriter = new SimpleStandardTermsIndexWriter(state); + indexWriter = new SimpleTermsIndexWriter(state); success = true; } finally { if (!success) { @@ -143,7 +143,7 @@ success = false; try { - FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); + FieldsConsumer ret = new PrefixCodedTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; return ret; } finally { @@ -159,19 +159,19 @@ @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - StandardPostingsReader postingsReader = new SepPostingsReaderImpl(state.dir, + AbstractPostingsReader postingsReader = new SepPostingsReaderImpl(state.dir, state.segmentInfo, state.readBufferSize, new MockIntFactory()); - StandardTermsIndexReader indexReader; + AbstractTermsIndexReader indexReader; boolean success = false; try { - indexReader = new SimpleStandardTermsIndexReader(state.dir, - state.fieldInfos, - state.segmentInfo.name, - state.termsIndexDivisor, - BytesRef.getUTF8SortedAsUnicodeComparator()); + indexReader = new SimpleTermsIndexReader(state.dir, + state.fieldInfos, + state.segmentInfo.name, + state.termsIndexDivisor, + BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; } finally { if (!success) { @@ -181,14 +181,14 @@ success = false; try { - FieldsProducer ret = new StandardTermsDictReader(indexReader, - state.dir, - state.fieldInfos, - state.segmentInfo.name, - postingsReader, - state.readBufferSize, - BytesRef.getUTF8SortedAsUnicodeComparator(), - StandardCodec.TERMS_CACHE_SIZE); + FieldsProducer ret = new PrefixCodedTermsReader(indexReader, + state.dir, + state.fieldInfos, + state.segmentInfo.name, + postingsReader, + state.readBufferSize, + BytesRef.getUTF8SortedAsUnicodeComparator(), + StandardCodec.TERMS_CACHE_SIZE); success = true; return ret; } finally { @@ -205,14 +205,14 @@ @Override public void files(Directory dir, SegmentInfo segmentInfo, Set files) { SepPostingsReaderImpl.files(segmentInfo, files); - StandardTermsDictReader.files(dir, segmentInfo, files); - SimpleStandardTermsIndexReader.files(dir, segmentInfo, files); + PrefixCodedTermsReader.files(dir, segmentInfo, files); + SimpleTermsIndexReader.files(dir, segmentInfo, files); } @Override public void getExtensions(Set extensions) { SepPostingsWriterImpl.getExtensions(extensions); - StandardTermsDictReader.getExtensions(extensions); - SimpleStandardTermsIndexReader.getIndexExtensions(extensions); + PrefixCodedTermsReader.getExtensions(extensions); + SimpleTermsIndexReader.getIndexExtensions(extensions); } } diff -r b896c1f47a25 lucene/src/test/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java --- 
a/lucene/src/test/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java Wed Sep 15 18:41:40 2010 +0000 +++ b/lucene/src/test/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java Thu Sep 16 05:41:06 2010 -0400 @@ -26,14 +26,14 @@ import org.apache.lucene.index.codecs.Codec; import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.FieldsProducer; -import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexReader; -import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexWriter; -import org.apache.lucene.index.codecs.standard.StandardPostingsReader; -import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; -import org.apache.lucene.index.codecs.standard.StandardTermsDictReader; -import org.apache.lucene.index.codecs.standard.StandardTermsDictWriter; -import org.apache.lucene.index.codecs.standard.StandardTermsIndexReader; -import org.apache.lucene.index.codecs.standard.StandardTermsIndexWriter; +import org.apache.lucene.index.codecs.SimpleTermsIndexReader; +import org.apache.lucene.index.codecs.SimpleTermsIndexWriter; +import org.apache.lucene.index.codecs.AbstractPostingsReader; +import org.apache.lucene.index.codecs.AbstractPostingsWriter; +import org.apache.lucene.index.codecs.PrefixCodedTermsReader; +import org.apache.lucene.index.codecs.PrefixCodedTermsWriter; +import org.apache.lucene.index.codecs.AbstractTermsIndexReader; +import org.apache.lucene.index.codecs.AbstractTermsIndexWriter; import org.apache.lucene.index.codecs.standard.StandardCodec; import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl; import org.apache.lucene.index.codecs.sep.SepPostingsReaderImpl; @@ -55,12 +55,12 @@ @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - StandardPostingsWriter postingsWriter = new SepPostingsWriterImpl(state, new MockSingleIntFactory()); + AbstractPostingsWriter postingsWriter = new SepPostingsWriterImpl(state, new MockSingleIntFactory()); boolean success = false; - StandardTermsIndexWriter indexWriter; + AbstractTermsIndexWriter indexWriter; try { - indexWriter = new SimpleStandardTermsIndexWriter(state); + indexWriter = new SimpleTermsIndexWriter(state); success = true; } finally { if (!success) { @@ -70,7 +70,7 @@ success = false; try { - FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); + FieldsConsumer ret = new PrefixCodedTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; return ret; } finally { @@ -87,16 +87,16 @@ @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - StandardPostingsReader postingsReader = new SepPostingsReaderImpl(state.dir, state.segmentInfo, state.readBufferSize, new MockSingleIntFactory()); + AbstractPostingsReader postingsReader = new SepPostingsReaderImpl(state.dir, state.segmentInfo, state.readBufferSize, new MockSingleIntFactory()); - StandardTermsIndexReader indexReader; + AbstractTermsIndexReader indexReader; boolean success = false; try { - indexReader = new SimpleStandardTermsIndexReader(state.dir, - state.fieldInfos, - state.segmentInfo.name, - state.termsIndexDivisor, - BytesRef.getUTF8SortedAsUnicodeComparator()); + indexReader = new SimpleTermsIndexReader(state.dir, + state.fieldInfos, + state.segmentInfo.name, + state.termsIndexDivisor, + BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; } finally { if (!success) { @@ -106,14 
+106,14 @@ success = false; try { - FieldsProducer ret = new StandardTermsDictReader(indexReader, - state.dir, - state.fieldInfos, - state.segmentInfo.name, - postingsReader, - state.readBufferSize, - BytesRef.getUTF8SortedAsUnicodeComparator(), - StandardCodec.TERMS_CACHE_SIZE); + FieldsProducer ret = new PrefixCodedTermsReader(indexReader, + state.dir, + state.fieldInfos, + state.segmentInfo.name, + postingsReader, + state.readBufferSize, + BytesRef.getUTF8SortedAsUnicodeComparator(), + StandardCodec.TERMS_CACHE_SIZE); success = true; return ret; } finally { @@ -130,8 +130,8 @@ @Override public void files(Directory dir, SegmentInfo segmentInfo, Set files) { SepPostingsReaderImpl.files(segmentInfo, files); - StandardTermsDictReader.files(dir, segmentInfo, files); - SimpleStandardTermsIndexReader.files(dir, segmentInfo, files); + PrefixCodedTermsReader.files(dir, segmentInfo, files); + SimpleTermsIndexReader.files(dir, segmentInfo, files); } @Override @@ -141,7 +141,7 @@ public static void getSepExtensions(Set extensions) { SepPostingsWriterImpl.getExtensions(extensions); - StandardTermsDictReader.getExtensions(extensions); - SimpleStandardTermsIndexReader.getIndexExtensions(extensions); + PrefixCodedTermsReader.getExtensions(extensions); + SimpleTermsIndexReader.getIndexExtensions(extensions); } } \ No newline at end of file
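
Note on the postings encoding shared by the reader and writer above (illustrative, not part of the patch): StandardPostingsWriter.startDoc() and the SegmentDocsEnum decode loop agree on one convention -- doc IDs are written as vInt deltas, and when the field does not omit term freqs the delta is left-shifted one bit so the low bit can flag the common freq==1 case, which then costs no extra byte. A minimal standalone round-trip of that convention, using plain java.io streams in place of Lucene's IndexOutput/IndexInput (class and method names here are hypothetical):

    import java.io.*;

    class FreqStreamSketch {
      // Same wire format as IndexOutput.writeVInt: 7 payload bits per byte,
      // high bit set means another byte follows.
      static void writeVInt(OutputStream out, int v) throws IOException {
        while ((v & ~0x7F) != 0) { out.write((v & 0x7F) | 0x80); v >>>= 7; }
        out.write(v);
      }

      static int readVInt(InputStream in) throws IOException {
        int b = in.read(), v = b & 0x7F;
        for (int shift = 7; (b & 0x80) != 0; shift += 7) {
          b = in.read();
          v |= (b & 0x7F) << shift;
        }
        return v;
      }

      public static void main(String[] args) throws IOException {
        int[] docs  = {3, 7, 42};  // ascending doc IDs for one term
        int[] freqs = {1, 5, 1};   // term freq in each doc

        // Encode, mirroring startDoc(): delta<<1 with low bit set when
        // freq==1, else delta<<1 followed by the freq as its own vInt.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        int lastDoc = 0;
        for (int i = 0; i < docs.length; i++) {
          int delta = docs[i] - lastDoc;
          if (freqs[i] == 1) {
            writeVInt(bytes, (delta << 1) | 1);
          } else {
            writeVInt(bytes, delta << 1);
            writeVInt(bytes, freqs[i]);
          }
          lastDoc = docs[i];
        }

        // Decode, mirroring SegmentDocsEnum.nextDoc().
        InputStream in = new ByteArrayInputStream(bytes.toByteArray());
        int doc = 0;
        for (int i = 0; i < docs.length; i++) {
          int code = readVInt(in);
          doc += code >>> 1;                               // shift off low bit
          int freq = ((code & 1) != 0) ? 1 : readVInt(in); // low bit => freq==1
          System.out.println("doc=" + doc + " freq=" + freq);
        }
      }
    }

This prints doc=3 freq=1, doc=7 freq=5, doc=42 freq=1. When a field omits term freqs (omitTermFreqAndPositions), both sides instead write and read the bare delta, as in the omitTF branches of the code above.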