Index: lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java =================================================================== --- lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java (working copy) +++ lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java (working copy) @@ -17,12 +17,15 @@ * limitations under the License. */ +import java.io.Closeable; import java.io.IOException; -import java.io.Closeable; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; /** * Extension of {@link PostingsConsumer} to support pluggable term dictionaries. @@ -39,7 +42,7 @@ // TODO: find a better name; this defines the API that the // terms dict impls use to talk to a postings impl. // TermsDict + PostingsReader/WriterBase == PostingsConsumer/Producer -public abstract class PostingsWriterBase extends PostingsConsumer implements Closeable { +public abstract class PostingsWriterBase implements Closeable { /** Sole constructor. (For invocation by subclass * constructors, typically implicit.) */ @@ -51,19 +54,17 @@ * the provided {@code termsOut}. */ public abstract void init(IndexOutput termsOut) throws IOException; - /** Return a newly created empty TermState */ - public abstract BlockTermState newTermState() throws IOException; + /** Write all postings for one term; use the provided + * {@link TermsEnum} to pull a {@link DocsEnum} or {@link + * DocsAndPositionsEnum}. This method should not + * re-position the {@code TermsEnum}! It is already + * positioned on the term that should be written. This + * method must set the bit in the provided {@link + * FixedBitSet} for every docID written. If no docs + * were written, this method should return null, and the + * terms dict will skip the term. */ + public abstract BlockTermState writeTerm(BytesRef term, TermsEnum termsEnum, FixedBitSet docsSeen) throws IOException; - /** Start a new term. Note that a matching call to {@link - * #finishTerm(BlockTermState)} is done, only if the term has at least one - * document. */ - public abstract void startTerm() throws IOException; - - /** Finishes the current term. The provided {@link - * BlockTermState} contains the term's summary statistics, - * and will holds metadata from PBF when returned */ - public abstract void finishTerm(BlockTermState state) throws IOException; - /** * Encode metadata as long[] and byte[]. {@code absolute} controls whether * current term is delta encoded according to latest term. Index: lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java =================================================================== --- lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java (working copy) +++ lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java (working copy) @@ -17,6 +17,7 @@ * limitations under the License. 
*/ +import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -24,13 +25,17 @@ import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.fst.Builder; @@ -174,7 +179,7 @@ * @lucene.experimental */ -public class BlockTreeTermsWriter extends PushFieldsConsumer { +public class BlockTreeTermsWriter extends FieldsConsumer implements Closeable { /** Suggested default value for the {@code * minItemsInBlock} parameter to {@link @@ -228,12 +233,12 @@ private final IndexOutput out; private final IndexOutput indexOut; + final int maxDoc; final int minItemsInBlock; final int maxItemsInBlock; final PostingsWriterBase postingsWriter; final FieldInfos fieldInfos; - FieldInfo currentField; private static class FieldMetaData { public final FieldInfo fieldInfo; @@ -273,7 +278,6 @@ int maxItemsInBlock) throws IOException { - super(state); if (minItemsInBlock <= 1) { throw new IllegalArgumentException("minItemsInBlock must be >= 2; got " + minItemsInBlock); } @@ -287,6 +291,8 @@ throw new IllegalArgumentException("maxItemsInBlock must be at least 2*(minItemsInBlock-1); got maxItemsInBlock=" + maxItemsInBlock + " minItemsInBlock=" + minItemsInBlock); } + maxDoc = state.segmentInfo.getDocCount(); + final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION); out = state.directory.createOutput(termsFileName, state.context); boolean success = false; @@ -303,7 +309,6 @@ indexOut = state.directory.createOutput(termsIndexFileName, state.context); writeIndexHeader(indexOut); - currentField = null; this.postingsWriter = postingsWriter; // segment = state.segmentName; @@ -338,16 +343,46 @@ private void writeIndexTrailer(IndexOutput indexOut, long dirStart) throws IOException { indexOut.writeLong(dirStart); } - + @Override - public TermsConsumer addField(FieldInfo field) throws IOException { - //DEBUG = field.name.equals("id"); - //if (DEBUG) System.out.println("\nBTTW.addField seg=" + segment + " field=" + field.name); - assert currentField == null || currentField.name.compareTo(field.name) < 0; - currentField = field; - return new TermsWriter(field); + public void write(Fields fields) throws IOException { + + boolean success = false; + try { + String lastField = null; + for(String field : fields) { + assert lastField == null || lastField.compareTo(field) < 0; + lastField = field; + + Terms terms = fields.terms(field); + if (terms == null) { + continue; + } + + TermsEnum termsEnum = terms.iterator(null); + + TermsWriter termsWriter = new TermsWriter(fieldInfos.fieldInfo(field)); + + while (true) { + BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + termsWriter.write(term, termsEnum); + } + + termsWriter.finish(); + } + success = true; + } finally { + if (success) { + IOUtils.close(this); + } else { + 
IOUtils.closeWhileHandlingException(this); + } + } } - + static long encodeOutput(long fp, boolean hasTerms, boolean isFloor) { assert fp < (1L << 62); return (fp << 2) | (hasTerms ? OUTPUT_FLAG_HAS_TERMS : 0) | (isFloor ? OUTPUT_FLAG_IS_FLOOR : 0); @@ -488,13 +523,13 @@ final RAMOutputStream scratchBytes = new RAMOutputStream(); - class TermsWriter extends TermsConsumer { + class TermsWriter { private final FieldInfo fieldInfo; private final int longsSize; private long numTerms; + final FixedBitSet docsSeen; long sumTotalTermFreq; long sumDocFreq; - int docCount; long indexStartFP; // Used only to partition terms into the block tree; we @@ -1000,6 +1035,7 @@ TermsWriter(FieldInfo fieldInfo) { this.fieldInfo = fieldInfo; + docsSeen = new FixedBitSet(maxDoc); noOutputs = NoOutputs.getSingleton(); @@ -1017,42 +1053,27 @@ this.longsSize = postingsWriter.setField(fieldInfo); } - @Override - public PostingsConsumer startTerm(BytesRef text) throws IOException { - //if (DEBUG) System.out.println("\nBTTW.startTerm term=" + fieldInfo.name + ":" + toString(text) + " seg=" + segment); - postingsWriter.startTerm(); - /* - if (fieldInfo.name.equals("id")) { - postingsWriter.termID = Integer.parseInt(text.utf8ToString()); - } else { - postingsWriter.termID = -1; - } - */ - return postingsWriter; - } - private final IntsRef scratchIntsRef = new IntsRef(); - @Override - public void finishTerm(BytesRef text, TermStats stats) throws IOException { + /** Writes one term's worth of postings. */ + public void write(BytesRef text, TermsEnum termsEnum) throws IOException { - assert stats.docFreq > 0; - //if (DEBUG) System.out.println("BTTW.finishTerm term=" + fieldInfo.name + ":" + toString(text) + " seg=" + segment + " df=" + stats.docFreq); + BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen); + if (state != null) { + assert state.docFreq != 0; + assert fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY || state.totalTermFreq >= state.docFreq: "postingsWriter=" + postingsWriter; + sumDocFreq += state.docFreq; + sumTotalTermFreq += state.totalTermFreq; + blockBuilder.add(Util.toIntsRef(text, scratchIntsRef), noOutputs.getNoOutput()); - blockBuilder.add(Util.toIntsRef(text, scratchIntsRef), noOutputs.getNoOutput()); - BlockTermState state = postingsWriter.newTermState(); - state.docFreq = stats.docFreq; - state.totalTermFreq = stats.totalTermFreq; - postingsWriter.finishTerm(state); - - PendingTerm term = new PendingTerm(BytesRef.deepCopyOf(text), state); - pending.add(term); - numTerms++; + PendingTerm term = new PendingTerm(BytesRef.deepCopyOf(text), state); + pending.add(term); + numTerms++; + } } // Finishes all terms in this field - @Override - public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException { + public void finish() throws IOException { if (numTerms > 0) { blockBuilder.finish(); @@ -1062,10 +1083,6 @@ assert root.prefix.length == 0; assert root.index.getEmptyOutput() != null; - this.sumTotalTermFreq = sumTotalTermFreq; - this.sumDocFreq = sumDocFreq; - this.docCount = docCount; - // Write FST to index indexStartFP = indexOut.getFilePointer(); root.index.save(indexOut); @@ -1085,12 +1102,12 @@ indexStartFP, sumTotalTermFreq, sumDocFreq, - docCount, + docsSeen.cardinality(), longsSize)); } else { assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY && sumTotalTermFreq == -1; assert sumDocFreq == 0; - assert docCount == 0; + assert docsSeen.cardinality() == 0; } } Index: 
lucene/core/src/java/org/apache/lucene/codecs/PushFieldsConsumer.java =================================================================== --- lucene/core/src/java/org/apache/lucene/codecs/PushFieldsConsumer.java (revision 1530744) +++ lucene/core/src/java/org/apache/lucene/codecs/PushFieldsConsumer.java (working copy) @@ -48,6 +48,7 @@ * * @lucene.experimental */ +// nocommit remove? public abstract class PushFieldsConsumer extends FieldsConsumer implements Closeable { final SegmentWriteState writeState; Index: lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java =================================================================== --- lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java (working copy) @@ -0,0 +1,212 @@ +package org.apache.lucene.codecs; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; + +/** + * Extension of {@link PostingsWriterBase} to support pluggable term dictionaries. *

+ * This class contains additional hooks to interact with the provided + * term dictionaries such as {@link BlockTreeTermsWriter}. If you want + * to re-use an existing implementation and are only interested in + * customizing the format of the postings list, extend this class + * instead. + * + * @see PostingsReaderBase + * @lucene.experimental + */ +// TODO: find a better name; this defines the API that the +// terms dict impls use to talk to a postings impl. +// TermsDict + PostingsReader/WriterBase == PostingsConsumer/Producer +public abstract class PushPostingsWriterBase extends PostingsWriterBase { + + // Reused in writeTerm + private DocsEnum docsEnum; + private DocsAndPositionsEnum posEnum; + private int enumFlags; + + protected FieldInfo fieldInfo; + protected IndexOptions indexOptions; + protected boolean writeFreqs; + protected boolean writePositions; + protected boolean writePayloads; + protected boolean writeOffsets; + + /** Sole constructor. (For invocation by subclass + * constructors, typically implicit.) */ + protected PushPostingsWriterBase() { + } + + /** Called once after startup, before any terms have been + * added. Implementations typically write a header to + * the provided {@code termsOut}. */ + public abstract void init(IndexOutput termsOut) throws IOException; + + /** Return a newly created empty TermState */ + public abstract BlockTermState newTermState() throws IOException; + + /** Start a new term. Note that a matching call to {@link + * #finishTerm(BlockTermState)} is made only if the term has at least one + * document. */ + public abstract void startTerm() throws IOException; + + /** Finishes the current term. The provided {@link + * BlockTermState} contains the term's summary statistics, + * and will hold metadata from the PBF when returned. */ + public abstract void finishTerm(BlockTermState state) throws IOException; + + /** + * Encode metadata as long[] and byte[]. {@code absolute} controls whether + * the current term is delta encoded relative to the latest term. + * Usually elements in {@code longs} are file pointers, so each one always + * increases when a new term is consumed. {@code out} is used to write generic + * bytes, which are not monotonic. + * + * NOTE: sometimes long[] might contain "don't care" values that are unused, e.g. + * the pointer to the postings list may not be defined for some terms but is defined + * for others, if the format is designed to inline some postings data in the term + * dictionary. In this case, the postings writer should always use the last value, + * so that each element in the metadata long[] remains monotonic. + */ + public abstract void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException; +
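To make the encodeTerm contract concrete, here is a minimal sketch (an editor's illustration, not part of this patch) of a writer whose only per-term metadata is a single file pointer kept in longs[0]; the names MyTermState, docStartFP and lastDocStartFP are hypothetical. Such a writer would return 1 from setField below:

    // Hypothetical sketch: delta-encode one file pointer into longs[0].
    private long lastDocStartFP;

    @Override
    public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo,
                           BlockTermState state, boolean absolute) throws IOException {
      MyTermState myState = (MyTermState) state;
      if (absolute) {
        // First term of a block: encode the raw pointer, not a delta
        lastDocStartFP = 0;
      }
      // File pointers only grow from term to term, so the delta stays >= 0
      longs[0] = myState.docStartFP - lastDocStartFP;
      lastDocStartFP = myState.docStartFP;
      // Non-monotonic metadata (e.g. a singleton docID) would go to 'out' instead
    }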
+ /** + * Sets the current field for writing, and returns the + * fixed length of the long[] metadata (which is fixed per + * field); called when writing switches to another field. */ + // TODO: better name? + public int setField(FieldInfo fieldInfo) { + this.fieldInfo = fieldInfo; + indexOptions = fieldInfo.getIndexOptions(); + + writeFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + writePositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + writeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + writePayloads = fieldInfo.hasPayloads(); + + if (writeFreqs == false) { + enumFlags = 0; + } else if (writePositions == false) { + enumFlags = DocsEnum.FLAG_FREQS; + } else if (writeOffsets == false) { + if (writePayloads) { + enumFlags = DocsAndPositionsEnum.FLAG_PAYLOADS; + } else { + enumFlags = 0; + } + } else { + if (writePayloads) { + enumFlags = DocsAndPositionsEnum.FLAG_PAYLOADS | DocsAndPositionsEnum.FLAG_OFFSETS; + } else { + enumFlags = DocsAndPositionsEnum.FLAG_OFFSETS; + } + } + + // nocommit awkward + return 0; + } + + @Override + public final BlockTermState writeTerm(BytesRef term, TermsEnum termsEnum, FixedBitSet docsSeen) throws IOException { + startTerm(); + if (writePositions == false) { + docsEnum = termsEnum.docs(null, docsEnum, enumFlags); + } else { + posEnum = termsEnum.docsAndPositions(null, posEnum, enumFlags); + docsEnum = posEnum; + } + assert docsEnum != null; + + int docFreq = 0; + long totalTermFreq = 0; + while (true) { + int docID = docsEnum.nextDoc(); + if (docID == DocsEnum.NO_MORE_DOCS) { + break; + } + docFreq++; + docsSeen.set(docID); + int freq; + if (writeFreqs) { + freq = docsEnum.freq(); + totalTermFreq += freq; + } else { + // nocommit 1? + freq = -1; + } + startDoc(docID, freq); + + if (writePositions) { + for(int i=0;i<freq;i++) { + int pos = posEnum.nextPosition(); + BytesRef payload = writePayloads ? posEnum.getPayload() : null; + int startOffset; + int endOffset; + if (writeOffsets) { + startOffset = posEnum.startOffset(); + endOffset = posEnum.endOffset(); + } else { + startOffset = -1; + endOffset = -1; + } + addPosition(pos, payload, startOffset, endOffset); + } + } + + finishDoc(); + } + + if (docFreq == 0) { + return null; + } else { + BlockTermState state = newTermState(); + state.docFreq = docFreq; + state.totalTermFreq = writeFreqs ? totalTermFreq : -1; + finishTerm(state); + return state; + } + } + + /** Adds a new doc in this term. freq will be -1 when term frequencies are omitted * for the field. */ + public abstract void startDoc(int docID, int freq) throws IOException; + + /** Add a new position & payload, and start/end offset. A + * null payload means no payload; a non-null payload with + * zero length also means no payload. Caller may reuse + * the {@link BytesRef} for the payload between calls + * (method must fully consume the payload). startOffset + * and endOffset will be -1 when offsets are not indexed. */ + public abstract void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException; + + /** Called when we are done adding positions & payloads + * for each doc.
*/ + public abstract void finishDoc() throws IOException; +} Property changes on: lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java =================================================================== --- lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java (revision 1530744) +++ lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java (working copy) @@ -26,7 +26,6 @@ import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; -import org.apache.lucene.codecs.PostingsWriterBase; import org.apache.lucene.index.DocsEnum; // javadocs import org.apache.lucene.index.FieldInfo.IndexOptions; // javadocs import org.apache.lucene.index.FieldInfos; // javadocs Index: lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java =================================================================== --- lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java (working copy) +++ lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java (working copy) @@ -17,32 +17,28 @@ * limitations under the License. */ -import static org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat.BLOCK_SIZE; -import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_DATA_SIZE; -import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_ENCODED_SIZE; - import java.io.IOException; -import java.util.ArrayList; -import java.util.List; import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.codecs.PushPostingsWriterBase; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.TermState; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.packed.PackedInts; +import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_DATA_SIZE; +import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_ENCODED_SIZE; +import static org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat.BLOCK_SIZE; + /** * Concrete class that writes docId(maybe frq,pos,offset,payloads) list * with postings format. @@ -52,7 +48,7 @@ * @see Lucene41SkipWriter for details about skipping setting and postings layout. * @lucene.experimental */ -public final class Lucene41PostingsWriter extends PostingsWriterBase { +public final class Lucene41PostingsWriter extends PushPostingsWriterBase { /** * Expert: The maximum number of skip levels. 
Smaller values result in @@ -77,12 +73,6 @@ final static IntBlockTermState emptyState = new IntBlockTermState(); IntBlockTermState lastState; - // How current field indexes postings: - private boolean fieldHasFreqs; - private boolean fieldHasPositions; - private boolean fieldHasOffsets; - private boolean fieldHasPayloads; - // Holds starting file pointers for current term: private long docStartFP; private long posStartFP; @@ -241,15 +231,11 @@ @Override public int setField(FieldInfo fieldInfo) { - IndexOptions indexOptions = fieldInfo.getIndexOptions(); - fieldHasFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; - fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; - fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; - fieldHasPayloads = fieldInfo.hasPayloads(); - skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads); + super.setField(fieldInfo); + skipWriter.setField(writePositions, writeOffsets, writePayloads); lastState = emptyState; - if (fieldHasPositions) { - if (fieldHasPayloads || fieldHasOffsets) { + if (writePositions) { + if (writePayloads || writeOffsets) { return 3; // doc + pos + pay FP } else { return 2; // doc + pos FP @@ -262,9 +248,9 @@ @Override public void startTerm() { docStartFP = docOut.getFilePointer(); - if (fieldHasPositions) { + if (writePositions) { posStartFP = posOut.getFilePointer(); - if (fieldHasPayloads || fieldHasOffsets) { + if (writePayloads || writeOffsets) { payStartFP = payOut.getFilePointer(); } } @@ -301,7 +287,7 @@ // if (DEBUG) { // System.out.println(" docDeltaBuffer[" + docBufferUpto + "]=" + docDelta); // } - if (fieldHasFreqs) { + if (writeFreqs) { freqBuffer[docBufferUpto] = termDocFreq; } docBufferUpto++; @@ -312,7 +298,7 @@ // System.out.println(" write docDelta block @ fp=" + docOut.getFilePointer()); // } forUtil.writeBlock(docDeltaBuffer, encoded, docOut); - if (fieldHasFreqs) { + if (writeFreqs) { // if (DEBUG) { // System.out.println(" write freq block @ fp=" + docOut.getFilePointer()); // } @@ -329,14 +315,13 @@ lastStartOffset = 0; } - /** Add a new position & payload */ @Override public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException { // if (DEBUG) { - // System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (fieldHasPayloads ? " payloadByteUpto=" + payloadByteUpto: "")); + // System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (writePayloads ? 
" payloadByteUpto=" + payloadByteUpto: "")); // } posDeltaBuffer[posBufferUpto] = position - lastPosition; - if (fieldHasPayloads) { + if (writePayloads) { if (payload == null || payload.length == 0) { // no payload payloadLengthBuffer[posBufferUpto] = 0; @@ -350,7 +335,7 @@ } } - if (fieldHasOffsets) { + if (writeOffsets) { assert startOffset >= lastStartOffset; assert endOffset >= startOffset; offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastStartOffset; @@ -366,13 +351,13 @@ // } forUtil.writeBlock(posDeltaBuffer, encoded, posOut); - if (fieldHasPayloads) { + if (writePayloads) { forUtil.writeBlock(payloadLengthBuffer, encoded, payOut); payOut.writeVInt(payloadByteUpto); payOut.writeBytes(payloadBytes, 0, payloadByteUpto); payloadByteUpto = 0; } - if (fieldHasOffsets) { + if (writeOffsets) { forUtil.writeBlock(offsetStartDeltaBuffer, encoded, payOut); forUtil.writeBlock(offsetLengthBuffer, encoded, payOut); } @@ -433,7 +418,7 @@ for(int i=0;i 0) { - // System.out.println(" write pos vInt block (count=" + posBufferUpto + ") at fp=" + posOut.getFilePointer() + " posStartFP=" + posStartFP + " hasPayloads=" + fieldHasPayloads + " hasOffsets=" + fieldHasOffsets); + // System.out.println(" write pos vInt block (count=" + posBufferUpto + ") at fp=" + posOut.getFilePointer() + " posStartFP=" + posStartFP + " hasPayloads=" + writePayloads + " hasOffsets=" + writeOffsets); // } // } @@ -474,7 +459,7 @@ int payloadBytesReadUpto = 0; for(int i=0;i 0) { - if (pos.payload == null) { - pos.payload = BytesRef.deepCopyOf(payload); - } else { - pos.payload.copyBytes(payload); + posEnum = termsEnum.docsAndPositions(null, posEnum, enumFlags); + assert posEnum != null; + while (posCount <= maxPositions) { + if (posEnum.nextDoc() == DocsEnum.NO_MORE_DOCS) { + break; } - } else if (pos.payload != null) { - pos.payload.length = 0; + posCount += posEnum.freq(); } } - } - @Override - public void finishDoc() throws IOException { - // if (DEBUG) System.out.println("PW finishDoc"); - if (pendingCount == -1) { - wrappedPostingsWriter.finishDoc(); + if (posCount == 0) { + // All docs were deleted + return null; } - } - private final RAMOutputStream buffer = new RAMOutputStream(); + // Second pass: write postings + if (posCount > maxPositions) { + // Too many positions; do not pulse. 
Just lset + // wrapped postingsWriter encode the postings: - // private int baseDocID; + PulsingTermState state = new PulsingTermState(); + state.wrappedState = wrappedPostingsWriter.writeTerm(term, termsEnum, docsSeen); + state.docFreq = state.wrappedState.docFreq; + state.totalTermFreq = state.wrappedState.totalTermFreq; + return state; + } else { + // Pulsed: + if (fieldHasPositions == false) { + docsEnum = termsEnum.docs(null, docsEnum, enumFlags); + } else { + posEnum = termsEnum.docsAndPositions(null, posEnum, enumFlags); + docsEnum = posEnum; + } + assert docsEnum != null; - /** Called when we are done adding docs to this term */ - @Override - public void finishTerm(BlockTermState _state) throws IOException { - PulsingTermState state = (PulsingTermState) _state; - - // if (DEBUG) System.out.println("PW finishTerm docCount=" + stats.docFreq + " pendingCount=" + pendingCount + " pendingTerms.size()=" + pendingTerms.size()); - - assert pendingCount > 0 || pendingCount == -1; - - if (pendingCount == -1) { - state.wrappedState.docFreq = state.docFreq; - state.wrappedState.totalTermFreq = state.totalTermFreq; - state.bytes = null; - wrappedPostingsWriter.finishTerm(state.wrappedState); - } else { // There were few enough total occurrences for this // term, so we fully inline our postings data into // terms dict, now: @@ -287,98 +202,138 @@ // given codec wants to store other interesting // stuff, it could use this pulsing codec to do so - if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) { - int lastDocID = 0; - int pendingIDX = 0; - int lastPayloadLength = -1; - int lastOffsetLength = -1; - while(pendingIDX < pendingCount) { - final Position doc = pending[pendingIDX]; + int lastDocID = 0; + int lastPayloadLength = -1; + int lastOffsetLength = -1; - final int delta = doc.docID - lastDocID; - lastDocID = doc.docID; + int docFreq = 0; + long totalTermFreq = 0; + while (true) { + int docID = docsEnum.nextDoc(); + if (docID == DocsEnum.NO_MORE_DOCS) { + break; + } + docsSeen.set(docID); - // if (DEBUG) System.out.println(" write doc=" + doc.docID + " freq=" + doc.termFreq); + int delta = docID - lastDocID; + lastDocID = docID; - if (doc.termFreq == 1) { - buffer.writeVInt((delta<<1)|1); + docFreq++; + + if (fieldHasFreqs) { + int freq = docsEnum.freq(); + totalTermFreq += freq; + + if (freq == 1) { + buffer.writeVInt((delta << 1) | 1); } else { - buffer.writeVInt(delta<<1); - buffer.writeVInt(doc.termFreq); + buffer.writeVInt(delta << 1); + buffer.writeVInt(freq); } - int lastPos = 0; - int lastOffset = 0; - for(int posIDX=0;posIDX 0) { + assert fieldHasPayloads; + assert payload != null; + buffer.writeBytes(payload.bytes, payload.offset, payload.length); } - lastOffset = pos.startOffset; - lastOffsetLength = offsetLength; } - - if (payloadLength > 0) { - assert storePayloads; - buffer.writeBytes(pos.payload.bytes, 0, pos.payload.length); - } } + } else { + buffer.writeVInt(delta); } - } else if (indexOptions == IndexOptions.DOCS_AND_FREQS) { - int lastDocID = 0; - for(int posIDX=0;posIDX= 0) { throw new UnsupportedOperationException("this codec cannot index offsets"); } skipListWriter.setIndexOptions(indexOptions); - storePayloads = indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS && fieldInfo.hasPayloads(); lastPayloadFP = 0; lastSkipFP = 0; lastState = setEmptyState(); @@ -233,7 +228,7 @@ // TODO: -- awkward we have to make these two // separate calls to skipper //System.out.println(" buffer skip lastDocID=" + lastDocID); - 
skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength); + skipListWriter.setSkipData(lastDocID, writePayloads, lastPayloadLength); skipListWriter.bufferSkip(df); } @@ -254,7 +249,7 @@ assert delta >= 0: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it) lastPosition = position; - if (storePayloads) { + if (writePayloads) { final int payloadLength = payload == null ? 0 : payload.length; if (payloadLength != lastPayloadLength) { lastPayloadLength = payloadLength; @@ -344,7 +339,7 @@ if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { lastState.posIndex.copyFrom(state.posIndex, false); lastState.posIndex.write(out, absolute); - if (storePayloads) { + if (writePayloads) { if (absolute) { out.writeVLong(state.payloadFP); } else { Index: lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java =================================================================== --- lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java (revision 1530744) +++ lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java (working copy) @@ -17,26 +17,29 @@ * limitations under the License. */ +import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; import java.util.List; +import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.codecs.PostingsConsumer; +import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.PushFieldsConsumer; import org.apache.lucene.codecs.TermStats; -import org.apache.lucene.codecs.BlockTermState; -import org.apache.lucene.codecs.TermsConsumer; import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.RamUsageEstimator; @@ -52,7 +55,7 @@ * @lucene.experimental */ -public class BlockTermsWriter extends PushFieldsConsumer { +public class BlockTermsWriter extends FieldsConsumer implements Closeable { final static String CODEC_NAME = "BLOCK_TERMS_DICT"; @@ -70,6 +73,7 @@ final FieldInfos fieldInfos; FieldInfo currentField; private final TermsIndexWriterBase termsIndexWriter; + private final int maxDoc; private static class FieldMetaData { public final FieldInfo fieldInfo; @@ -99,9 +103,9 @@ public BlockTermsWriter(TermsIndexWriterBase termsIndexWriter, SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException { - super(state); final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION); this.termsIndexWriter = termsIndexWriter; + maxDoc = state.segmentInfo.getDocCount(); out = state.directory.createOutput(termsFileName, state.context); boolean success = false; try { @@ -127,7 +131,43 @@ } @Override - public TermsConsumer addField(FieldInfo field) throws IOException { + public void write(Fields fields) throws 
IOException { + + boolean success = false; + try { + for(String field : fields) { + + Terms terms = fields.terms(field); + if (terms == null) { + continue; + } + + TermsEnum termsEnum = terms.iterator(null); + + TermsWriter termsWriter = addField(fieldInfos.fieldInfo(field)); + + while (true) { + BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + + termsWriter.write(term, termsEnum); + } + + termsWriter.finish(); + } + success = true; + } finally { + if (success) { + IOUtils.close(this); + } else { + IOUtils.closeWhileHandlingException(this); + } + } + } + + private TermsWriter addField(FieldInfo field) throws IOException { //System.out.println("\nBTW.addField seg=" + segment + " field=" + field.name); assert currentField == null || currentField.name.compareTo(field.name) < 0; currentField = field; @@ -135,7 +175,6 @@ return new TermsWriter(fieldIndexWriter, field, postingsWriter); } - @Override public void close() throws IOException { try { final long dirStart = out.getFilePointer(); @@ -169,12 +208,13 @@ public BlockTermState state; } - class TermsWriter extends TermsConsumer { + class TermsWriter { private final FieldInfo fieldInfo; private final PostingsWriterBase postingsWriter; private final long termsStartPointer; private long numTerms; private final TermsIndexWriterBase.FieldWriter fieldIndexWriter; + private final FixedBitSet docsSeen; long sumTotalTermFreq; long sumDocFreq; int docCount; @@ -191,6 +231,7 @@ { this.fieldInfo = fieldInfo; this.fieldIndexWriter = fieldIndexWriter; + this.docsSeen = new FixedBitSet(maxDoc); pendingTerms = new TermEntry[32]; for(int i=0;i fields = new ArrayList(); IndexOutput blockOut = null; IndexOutput indexOut = null; public FSTOrdTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException { - super(state); final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION); final String termsBlockFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_BLOCK_EXTENSION); this.postingsWriter = postingsWriter; this.fieldInfos = state.fieldInfos; + this.maxDoc = state.segmentInfo.getDocCount(); boolean success = false; try { @@ -180,11 +182,41 @@ } @Override - public TermsConsumer addField(FieldInfo field) throws IOException { - return new TermsWriter(field); + public void write(Fields fields) throws IOException { + try { + for(String field : fields) { + Terms terms = fields.terms(field); + if (terms == null) { + continue; + } + FieldInfo fieldInfo = fieldInfos.fieldInfo(field); + boolean hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + TermsEnum termsEnum = terms.iterator(null); + TermsWriter termsWriter = new TermsWriter(fieldInfo); + + long sumTotalTermFreq = 0; + long sumDocFreq = 0; + FixedBitSet docsSeen = new FixedBitSet(maxDoc); + while (true) { + BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + BlockTermState termState = postingsWriter.writeTerm(term, termsEnum, docsSeen); + if (termState != null) { + termsWriter.finishTerm(term, termState); + sumTotalTermFreq += termState.totalTermFreq; + sumDocFreq += termState.docFreq; + } + } + + termsWriter.finish(hasFreq ? 
sumTotalTermFreq : -1, sumDocFreq, docsSeen.cardinality()); + } + } finally { + close(); + } } - @Override public void close() throws IOException { IOException ioe = null; try { @@ -247,7 +279,7 @@ public RAMOutputStream metaBytesOut; } - final class TermsWriter extends TermsConsumer { + final class TermsWriter { private final Builder builder; private final PositiveIntOutputs outputs; private final FieldInfo fieldInfo; @@ -284,34 +316,23 @@ this.lastMetaBytesFP = 0; } - @Override - public PostingsConsumer startTerm(BytesRef text) throws IOException { - postingsWriter.startTerm(); - return postingsWriter; - } - - @Override - public void finishTerm(BytesRef text, TermStats stats) throws IOException { + public void finishTerm(BytesRef text, BlockTermState state) throws IOException { if (numTerms > 0 && numTerms % SKIP_INTERVAL == 0) { bufferSkip(); } // write term meta data into fst final long longs[] = new long[longsSize]; - final long delta = stats.totalTermFreq - stats.docFreq; - if (stats.totalTermFreq > 0) { + final long delta = state.totalTermFreq - state.docFreq; + if (state.totalTermFreq > 0) { if (delta == 0) { - statsOut.writeVInt(stats.docFreq<<1|1); + statsOut.writeVInt(state.docFreq<<1|1); } else { - statsOut.writeVInt(stats.docFreq<<1|0); - statsOut.writeVLong(stats.totalTermFreq-stats.docFreq); + statsOut.writeVInt(state.docFreq<<1|0); + statsOut.writeVLong(state.totalTermFreq-state.docFreq); } } else { - statsOut.writeVInt(stats.docFreq); + statsOut.writeVInt(state.docFreq); } - BlockTermState state = postingsWriter.newTermState(); - state.docFreq = stats.docFreq; - state.totalTermFreq = stats.totalTermFreq; - postingsWriter.finishTerm(state); postingsWriter.encodeTerm(longs, metaBytesOut, fieldInfo, state, true); for (int i = 0; i < longsSize; i++) { metaLongsOut.writeVLong(longs[i] - lastLongs[i]); @@ -325,7 +346,6 @@ lastMetaBytesFP = metaBytesOut.getFilePointer(); } - @Override public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException { if (numTerms > 0) { final FieldMetaData metadata = new FieldMetaData(); Index: lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermOutputs.java =================================================================== --- lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermOutputs.java (revision 1530744) +++ lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermOutputs.java (working copy) @@ -25,7 +25,6 @@ import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.fst.Outputs; -import org.apache.lucene.util.LongsRef; /** * An FST {@link Outputs} implementation for @@ -89,6 +88,11 @@ } @Override + public String toString() { + return "FSTTermOutputs$TermData longs=" + Arrays.toString(longs) + " bytes=" + Arrays.toString(bytes) + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq; + } + + @Override public boolean equals(Object other_) { if (other_ == this) { return true; @@ -221,6 +225,7 @@ @Override public void write(TermData data, DataOutput out) throws IOException { + assert hasPos || data.totalTermFreq == -1; int bit0 = allZero(data.longs) ? 0 : 1; int bit1 = ((data.bytes == null || data.bytes.length == 0) ? 0 : 1) << 1; int bit2 = ((data.docFreq == 0) ? 
0 : 1) << 2; Index: lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java =================================================================== --- lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java (revision 1530744) +++ lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java (working copy) @@ -25,9 +25,8 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader; import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter; -import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader; -import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.IOUtils; Index: lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java =================================================================== --- lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java (revision 1530744) +++ lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java (working copy) @@ -25,9 +25,8 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader; import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter; -import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader; -import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.IOUtils; Index: lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPulsing41PostingsFormat.java =================================================================== --- lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPulsing41PostingsFormat.java (revision 1530744) +++ lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPulsing41PostingsFormat.java (working copy) @@ -25,12 +25,9 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter; -import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader; import org.apache.lucene.codecs.lucene41.Lucene41PostingsBaseFormat; -import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; +import org.apache.lucene.codecs.pulsing.PulsingPostingsReader; import org.apache.lucene.codecs.pulsing.PulsingPostingsWriter; -import org.apache.lucene.codecs.pulsing.PulsingPostingsReader; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.IOUtils; Index: lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java =================================================================== --- lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java (revision 1530744) +++ lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java (working copy) @@ -23,20 +23,21 @@ import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.codecs.PostingsConsumer; +import org.apache.lucene.codecs.FieldsConsumer; import 
org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.PushFieldsConsumer; -import org.apache.lucene.codecs.TermStats; -import org.apache.lucene.codecs.TermsConsumer; import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.fst.Builder; @@ -119,7 +120,7 @@ * @lucene.experimental */ -public class FSTTermsWriter extends PushFieldsConsumer { +public class FSTTermsWriter extends FieldsConsumer { static final String TERMS_EXTENSION = "tmp"; static final String TERMS_CODEC_NAME = "FST_TERMS_DICT"; public static final int TERMS_VERSION_START = 0; @@ -128,15 +129,16 @@ final PostingsWriterBase postingsWriter; final FieldInfos fieldInfos; final IndexOutput out; + final int maxDoc; final List<FieldMetaData> fields = new ArrayList<FieldMetaData>(); public FSTTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException { - super(state); final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION); this.postingsWriter = postingsWriter; this.fieldInfos = state.fieldInfos; this.out = state.directory.createOutput(termsFileName, state.context); + this.maxDoc = state.segmentInfo.getDocCount(); boolean success = false; try { @@ -149,19 +151,53 @@ } } } + private void writeHeader(IndexOutput out) throws IOException { CodecUtil.writeHeader(out, TERMS_CODEC_NAME, TERMS_VERSION_CURRENT); } + private void writeTrailer(IndexOutput out, long dirStart) throws IOException { out.writeLong(dirStart); } @Override - public TermsConsumer addField(FieldInfo field) throws IOException { - return new TermsWriter(field); + public void write(Fields fields) throws IOException { + try { + for(String field : fields) { + Terms terms = fields.terms(field); + if (terms == null) { + continue; + } + FieldInfo fieldInfo = fieldInfos.fieldInfo(field); + boolean hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + TermsEnum termsEnum = terms.iterator(null); + TermsWriter termsWriter = new TermsWriter(fieldInfo); + + long sumTotalTermFreq = 0; + long sumDocFreq = 0; + FixedBitSet docsSeen = new FixedBitSet(maxDoc); + + while (true) { + BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + + BlockTermState termState = postingsWriter.writeTerm(term, termsEnum, docsSeen); + if (termState != null) { + termsWriter.finishTerm(term, termState); + sumTotalTermFreq += termState.totalTermFreq; + sumDocFreq += termState.docFreq; + } + } + + termsWriter.finish(hasFreq ?
sumTotalTermFreq : -1, sumDocFreq, docsSeen.cardinality()); + } + } finally { + close(); + } } - @Override public void close() throws IOException { IOException ioe = null; try { @@ -208,7 +244,7 @@ } } - final class TermsWriter extends TermsConsumer { + final class TermsWriter { private final Builder builder; private final FSTTermOutputs outputs; private final FieldInfo fieldInfo; @@ -226,22 +262,13 @@ this.builder = new Builder(FST.INPUT_TYPE.BYTE1, outputs); } - @Override - public PostingsConsumer startTerm(BytesRef text) throws IOException { - postingsWriter.startTerm(); - return postingsWriter; - } - - @Override - public void finishTerm(BytesRef text, TermStats stats) throws IOException { + public void finishTerm(BytesRef text, BlockTermState state) throws IOException { // write term meta data into fst - final BlockTermState state = postingsWriter.newTermState(); final FSTTermOutputs.TermData meta = new FSTTermOutputs.TermData(); meta.longs = new long[longsSize]; meta.bytes = null; - meta.docFreq = state.docFreq = stats.docFreq; - meta.totalTermFreq = state.totalTermFreq = stats.totalTermFreq; - postingsWriter.finishTerm(state); + meta.docFreq = state.docFreq; + meta.totalTermFreq = state.totalTermFreq; postingsWriter.encodeTerm(meta.longs, metaWriter, fieldInfo, state, true); final int bytesSize = (int)metaWriter.getFilePointer(); if (bytesSize > 0) { @@ -253,7 +280,6 @@ numTerms++; } - @Override public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException { // save FST dict if (numTerms > 0) { Index: lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java =================================================================== --- lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java (revision 1530744) +++ lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java (working copy) @@ -28,15 +28,12 @@ import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.FieldsProducer; -import org.apache.lucene.codecs.PostingsConsumer; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.PushFieldsConsumer; -import org.apache.lucene.codecs.TermStats; -import org.apache.lucene.codecs.TermsConsumer; import org.apache.lucene.codecs.bloom.FuzzySet.ContainsResult; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; @@ -118,9 +115,7 @@ * "blm" file. This PostingsFormat delegates to a choice of delegate * PostingsFormat for encoding all other postings data. This choice of * constructor defaults to the {@link DefaultBloomFilterFactory} for - * configuring per-field BloomFilters. Note that the - * wrapped PostingsFormat must use a {@link PushFieldsConsumer} - * for writing. + * configuring per-field BloomFilters. * * @param delegatePostingsFormat * The PostingsFormat that records all the non-bloom filter data i.e. 
@@ -144,11 +139,7 @@ + " has been constructed without a choice of PostingsFormat"); } FieldsConsumer fieldsConsumer = delegatePostingsFormat.fieldsConsumer(state); - if (!(fieldsConsumer instanceof PushFieldsConsumer)) { - throw new UnsupportedOperationException("Wrapped PostingsFormat must return a PushFieldsConsumer"); - } - return new BloomFilteredFieldsConsumer( - (PushFieldsConsumer) fieldsConsumer, state); + return new BloomFilteredFieldsConsumer(fieldsConsumer, state); } @Override @@ -310,7 +301,7 @@ this.delegateTermsEnum = null; } - private final TermsEnum delegate() throws IOException { + private TermsEnum delegate() throws IOException { if (delegateTermsEnum == null) { /* pull the iterator only if we really need it - * this can be a relativly heavy operation depending on the @@ -322,12 +313,12 @@ } @Override - public final BytesRef next() throws IOException { + public BytesRef next() throws IOException { return delegate().next(); } @Override - public final boolean seekExact(BytesRef text) + public boolean seekExact(BytesRef text) throws IOException { // The magical fail-fast speed up that is the entire point of all of // this code - save a disk seek if there is a match on an in-memory @@ -341,33 +332,33 @@ } @Override - public final SeekStatus seekCeil(BytesRef text) + public SeekStatus seekCeil(BytesRef text) throws IOException { return delegate().seekCeil(text); } @Override - public final void seekExact(long ord) throws IOException { + public void seekExact(long ord) throws IOException { delegate().seekExact(ord); } @Override - public final BytesRef term() throws IOException { + public BytesRef term() throws IOException { return delegate().term(); } @Override - public final long ord() throws IOException { + public long ord() throws IOException { return delegate().ord(); } @Override - public final int docFreq() throws IOException { + public int docFreq() throws IOException { return delegate().docFreq(); } @Override - public final long totalTermFreq() throws IOException { + public long totalTermFreq() throws IOException { return delegate().totalTermFreq(); } @@ -396,35 +387,60 @@ } } - class BloomFilteredFieldsConsumer extends PushFieldsConsumer { - private PushFieldsConsumer delegateFieldsConsumer; + class BloomFilteredFieldsConsumer extends FieldsConsumer { + private FieldsConsumer delegateFieldsConsumer; private Map bloomFilters = new HashMap(); private SegmentWriteState state; - public BloomFilteredFieldsConsumer(PushFieldsConsumer fieldsConsumer, + public BloomFilteredFieldsConsumer(FieldsConsumer fieldsConsumer, SegmentWriteState state) { - super(state); this.delegateFieldsConsumer = fieldsConsumer; this.state = state; } - + @Override - public TermsConsumer addField(FieldInfo field) throws IOException { - FuzzySet bloomFilter = bloomFilterFactory.getSetForField(state,field); - if (bloomFilter != null) { - assert bloomFilters.containsKey(field) == false; - bloomFilters.put(field, bloomFilter); - return new WrappedTermsConsumer(delegateFieldsConsumer.addField(field), bloomFilter); - } else { - // No, use the unfiltered fieldsConsumer - we are not interested in - // recording any term Bitsets. 
- return delegateFieldsConsumer.addField(field); + public void write(Fields fields) throws IOException { + try { + for(String field : fields) { + Terms terms = fields.terms(field); + if (terms == null) { + continue; + } + FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field); + TermsEnum termsEnum = terms.iterator(null); + + FuzzySet bloomFilter = null; + + DocsEnum docsEnum = null; + while (true) { + BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + if (bloomFilter == null) { + bloomFilter = bloomFilterFactory.getSetForField(state, fieldInfo); + if (bloomFilter == null) { + // Field not bloom'd + break; + } + assert bloomFilters.containsKey(field) == false; + bloomFilters.put(fieldInfo, bloomFilter); + } + // Make sure there's at least one doc for this term: + docsEnum = termsEnum.docs(null, docsEnum, 0); + if (docsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS) { + bloomFilter.addValue(term); + } + } + } + } finally { + close(); } + + delegateFieldsConsumer.write(fields); } - - @Override + public void close() throws IOException { - delegateFieldsConsumer.close(); // Now we are done accumulating values for these fields List> nonSaturatedBlooms = new ArrayList>(); @@ -470,37 +486,5 @@ } rightSizedSet.serialize(bloomOutput); } - } - - class WrappedTermsConsumer extends TermsConsumer { - private TermsConsumer delegateTermsConsumer; - private FuzzySet bloomFilter; - - public WrappedTermsConsumer(TermsConsumer termsConsumer,FuzzySet bloomFilter) { - this.delegateTermsConsumer = termsConsumer; - this.bloomFilter = bloomFilter; - } - - @Override - public PostingsConsumer startTerm(BytesRef text) throws IOException { - return delegateTermsConsumer.startTerm(text); - } - - @Override - public void finishTerm(BytesRef text, TermStats stats) throws IOException { - - // Record this term in our BloomFilter - if (stats.docFreq > 0) { - bloomFilter.addValue(text); - } - delegateTermsConsumer.finishTerm(text, stats); - } - - @Override - public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) - throws IOException { - delegateTermsConsumer.finish(sumTotalTermFreq, sumDocFreq, docCount); - } - } } Index: lucene/test-framework/src/java/org/apache/lucene/codecs/mockintblock/MockFixedIntBlockPostingsFormat.java =================================================================== --- lucene/test-framework/src/java/org/apache/lucene/codecs/mockintblock/MockFixedIntBlockPostingsFormat.java (revision 1530744) +++ lucene/test-framework/src/java/org/apache/lucene/codecs/mockintblock/MockFixedIntBlockPostingsFormat.java (working copy) @@ -37,8 +37,8 @@ import org.apache.lucene.codecs.sep.IntStreamFactory; import org.apache.lucene.codecs.sep.SepPostingsReader; import org.apache.lucene.codecs.sep.SepPostingsWriter; +import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.store.*; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; Index: lucene/test-framework/src/java/org/apache/lucene/codecs/mockintblock/MockVariableIntBlockPostingsFormat.java =================================================================== --- lucene/test-framework/src/java/org/apache/lucene/codecs/mockintblock/MockVariableIntBlockPostingsFormat.java (revision 1530744) +++ lucene/test-framework/src/java/org/apache/lucene/codecs/mockintblock/MockVariableIntBlockPostingsFormat.java (working copy) @@ -37,8 +37,8 @@ import org.apache.lucene.codecs.sep.IntStreamFactory; import 
org.apache.lucene.codecs.sep.SepPostingsReader; import org.apache.lucene.codecs.sep.SepPostingsWriter; +import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; Index: lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40RWPostingsFormat.java =================================================================== --- lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40RWPostingsFormat.java (revision 1530744) +++ lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40RWPostingsFormat.java (working copy) @@ -1,13 +1,5 @@ package org.apache.lucene.codecs.lucene40; -import java.io.IOException; - -import org.apache.lucene.codecs.BlockTreeTermsWriter; -import org.apache.lucene.codecs.FieldsConsumer; -import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.util.LuceneTestCase; - /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -25,6 +17,14 @@ * limitations under the License. */ +import java.io.IOException; + +import org.apache.lucene.codecs.BlockTreeTermsWriter; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.util.LuceneTestCase; + /** * Read-write version of {@link Lucene40PostingsFormat} for testing. */ Index: lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java =================================================================== --- lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java (revision 1530744) +++ lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java (working copy) @@ -21,22 +21,18 @@ * index file format */ import java.io.IOException; -import java.util.ArrayList; -import java.util.List; import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.TermStats; +import org.apache.lucene.codecs.PushPostingsWriterBase; import org.apache.lucene.index.CorruptIndexException; -import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.DocsEnum; // javadocs import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; @@ -46,7 +42,7 @@ * @see Lucene40PostingsFormat * @lucene.experimental */ -public final class Lucene40PostingsWriter extends PostingsWriterBase { +public final class Lucene40PostingsWriter extends PushPostingsWriterBase { final IndexOutput freqOut; final IndexOutput proxOut; @@ -70,13 +66,9 @@ final int maxSkipLevels = 10; final int totalNumDocs; - IndexOptions indexOptions; - boolean storePayloads; - boolean storeOffsets; // Starts a new term long freqStart; long proxStart; - FieldInfo fieldInfo; int lastPayloadLength; int 
Index: lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java	(revision 1530744)
+++ lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java	(working copy)
@@ -21,22 +21,18 @@
  * index file format */
 
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
 
 import org.apache.lucene.codecs.BlockTermState;
 import org.apache.lucene.codecs.CodecUtil;
-import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.codecs.TermStats;
+import org.apache.lucene.codecs.PushPostingsWriterBase;
 import org.apache.lucene.index.CorruptIndexException;
-import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.DocsEnum; // javadocs
 import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.store.RAMOutputStream;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
@@ -46,7 +42,7 @@
  * @see Lucene40PostingsFormat
  * @lucene.experimental
  */
-public final class Lucene40PostingsWriter extends PostingsWriterBase {
+public final class Lucene40PostingsWriter extends PushPostingsWriterBase {
 
   final IndexOutput freqOut;
   final IndexOutput proxOut;
@@ -70,13 +66,9 @@
   final int maxSkipLevels = 10;
   final int totalNumDocs;
 
-  IndexOptions indexOptions;
-  boolean storePayloads;
-  boolean storeOffsets;
   // Starts a new term
   long freqStart;
   long proxStart;
-  FieldInfo fieldInfo;
   int lastPayloadLength;
   int lastOffsetLength;
   int lastPosition;
@@ -150,7 +142,6 @@
     return new StandardTermState();
   }
 
-  @Override
   public void startTerm() {
     freqStart = freqOut.getFilePointer();
@@ -169,6 +160,7 @@
   // our parent calls setField whenever the field changes
   @Override
   public int setField(FieldInfo fieldInfo) {
+    super.setField(fieldInfo);
     //System.out.println("SPW: setField");
     /*
     if (BlockTreeTermsWriter.DEBUG && fieldInfo.name.equals("id")) {
@@ -177,11 +169,7 @@
       DEBUG = false;
     }
     */
-    this.fieldInfo = fieldInfo;
-    indexOptions = fieldInfo.getIndexOptions();
-
-    storeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
-    storePayloads = fieldInfo.hasPayloads();
+    lastState = emptyState;
     //System.out.println("  set init blockFreqStart=" + freqStart);
     //System.out.println("  set init blockProxStart=" + proxStart);
@@ -190,7 +178,7 @@
 
   int lastDocID;
   int df;
-
+
   @Override
   public void startDoc(int docID, int termDocFreq) throws IOException {
     // if (DEBUG) System.out.println("SPW:   startDoc seg=" + segment + " docID=" + docID + " tf=" + termDocFreq + " freqOut.fp=" + freqOut.getFilePointer());
@@ -202,7 +190,7 @@
     }
 
     if ((++df % skipInterval) == 0) {
-      skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength, storeOffsets, lastOffsetLength);
+      skipListWriter.setSkipData(lastDocID, writePayloads, lastPayloadLength, writeOffsets, lastOffsetLength);
       skipListWriter.bufferSkip(df);
     }
@@ -237,7 +225,7 @@
 
     int payloadLength = 0;
 
-    if (storePayloads) {
+    if (writePayloads) {
       payloadLength = payload == null ? 0 : payload.length;
 
       if (payloadLength != lastPayloadLength) {
@@ -251,7 +239,7 @@
       proxOut.writeVInt(delta);
     }
 
-    if (storeOffsets) {
+    if (writeOffsets) {
       // don't use startOffset - lastEndOffset, because this creates lots of negative vints for synonyms,
       // and the numbers aren't that much smaller anyways.
       int offsetDelta = startOffset - lastOffset;
@@ -285,7 +273,7 @@
 
   /** Called when we are done adding docs to this term */
   @Override
   public void finishTerm(BlockTermState _state) throws IOException {
-    StandardTermState state = (StandardTermState)_state;
+    StandardTermState state = (StandardTermState) _state;
     // if (DEBUG) System.out.println("SPW: finishTerm seg=" + segment + " freqStart=" + freqStart);
 
     assert state.docFreq > 0;
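Note: the removed storePayloads/storeOffsets bookkeeping is hoisted into PushPostingsWriterBase (added elsewhere in this change) as writePayloads/writeOffsets, which setField() derives once per field. A sketch reconstructed from the removed lines above, assuming the base class computes the flags the same way (FieldWriteFlags is a hypothetical stand-in, not the actual base-class code):

import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;

final class FieldWriteFlags {
  final boolean writeOffsets;   // field indexes character offsets
  final boolean writePayloads;  // field carries per-position payloads

  FieldWriteFlags(FieldInfo fieldInfo) {
    IndexOptions indexOptions = fieldInfo.getIndexOptions();
    // offsets are written only when the field's index options include them:
    writeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    // payloads are a per-field property recorded in FieldInfo:
    writePayloads = fieldInfo.hasPayloads();
  }
}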
Index: lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java	(revision 1530744)
+++ lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java	(working copy)
@@ -40,6 +40,10 @@
 import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexWriter;
 import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader;
 import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter;
+import org.apache.lucene.codecs.memory.FSTOrdTermsReader;
+import org.apache.lucene.codecs.memory.FSTOrdTermsWriter;
+import org.apache.lucene.codecs.memory.FSTTermsReader;
+import org.apache.lucene.codecs.memory.FSTTermsWriter;
 import org.apache.lucene.codecs.mockintblock.MockFixedIntBlockPostingsFormat;
 import org.apache.lucene.codecs.mockintblock.MockVariableIntBlockPostingsFormat;
 import org.apache.lucene.codecs.mocksep.MockSingleIntFactory;
@@ -50,10 +54,6 @@
 import org.apache.lucene.codecs.sep.IntStreamFactory;
 import org.apache.lucene.codecs.sep.SepPostingsReader;
 import org.apache.lucene.codecs.sep.SepPostingsWriter;
-import org.apache.lucene.codecs.memory.FSTTermsWriter;
-import org.apache.lucene.codecs.memory.FSTTermsReader;
-import org.apache.lucene.codecs.memory.FSTOrdTermsWriter;
-import org.apache.lucene.codecs.memory.FSTOrdTermsReader;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentReadState;
Index: lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java	(revision 1530744)
+++ lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java	(working copy)
@@ -163,6 +163,7 @@
       boolean hasFreqs = fieldInfo.getIndexOptions().compareTo(FieldInfo.IndexOptions.DOCS_AND_FREQS) >= 0;
       boolean hasPositions = fieldInfo.getIndexOptions().compareTo(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
       boolean hasOffsets = fieldInfo.getIndexOptions().compareTo(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+      boolean hasPayloads = terms.hasPayloads();
 
       assert hasPositions == terms.hasPositions();
       assert hasOffsets == terms.hasOffsets();
@@ -179,14 +180,16 @@
           lastTerm.copyBytes(term);
         }
 
+        int flags = 0;
         if (hasPositions == false) {
-          int flags = 0;
           if (hasFreqs) {
             flags = flags | DocsEnum.FLAG_FREQS;
           }
           docsEnum = termsEnum.docs(null, docsEnum, flags);
         } else {
-          int flags = DocsAndPositionsEnum.FLAG_PAYLOADS;
+          if (hasPayloads) {
+            flags |= DocsAndPositionsEnum.FLAG_PAYLOADS;
+          }
           if (hasOffsets) {
             flags = flags | DocsAndPositionsEnum.FLAG_OFFSETS;
           }
@@ -194,6 +197,8 @@
           docsEnum = posEnum;
         }
 
+        assert docsEnum != null : "termsEnum=" + termsEnum + " hasPositions=" + hasPositions;
+
         int lastDocID = -1;
 
         while(true) {
@@ -212,13 +217,13 @@
             int lastStartOffset = -1;
             for(int i=0;i<freq;i++) {
               int pos = posEnum.nextPosition();
-              assert pos > lastPos;
+              assert pos >= lastPos: "pos=" + pos + " vs lastPos=" + lastPos + " i=" + i + " freq=" + freq;
               lastPos = pos;
 
               if (hasOffsets) {
                 int startOffset = posEnum.startOffset();
                 int endOffset = posEnum.endOffset();
-                assert endOffset > startOffset;
+                assert endOffset >= startOffset;
                 assert startOffset >= lastStartOffset;
                 lastStartOffset = startOffset;
               }
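Note: the asserting consumer now requests payloads only when the field actually has them, which is why hasPayloads is added above; unconditionally requesting FLAG_PAYLOADS was wrong for payload-free fields. A standalone illustration of how the request flags combine (the helper class PostingsFlags is hypothetical, not patch code):

import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;

final class PostingsFlags {
  // Flags for a docs-only enum: optionally pull per-doc term frequencies.
  static int docsFlags(boolean hasFreqs) {
    int flags = 0;
    if (hasFreqs) {
      flags |= DocsEnum.FLAG_FREQS;
    }
    return flags;
  }

  // Flags for a positions enum: request payloads/offsets only when the
  // field indexes them.
  static int positionsFlags(boolean hasPayloads, boolean hasOffsets) {
    int flags = 0;
    if (hasPayloads) {
      flags |= DocsAndPositionsEnum.FLAG_PAYLOADS;
    }
    if (hasOffsets) {
      flags |= DocsAndPositionsEnum.FLAG_OFFSETS;
    }
    return flags;
  }
}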
Index: lucene/test-framework/src/java/org/apache/lucene/codecs/mocksep/MockSepPostingsFormat.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/codecs/mocksep/MockSepPostingsFormat.java	(revision 1530744)
+++ lucene/test-framework/src/java/org/apache/lucene/codecs/mocksep/MockSepPostingsFormat.java	(working copy)
@@ -32,8 +32,8 @@
 import org.apache.lucene.codecs.blockterms.TermsIndexWriterBase;
 import org.apache.lucene.codecs.sep.SepPostingsReader;
 import org.apache.lucene.codecs.sep.SepPostingsWriter;
+import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
-import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.util.BytesRef;
 
 /**
Index: lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java	(revision 1530744)
+++ lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java	(working copy)
@@ -538,7 +538,7 @@
 
     @Override
     public boolean hasPayloads() {
-      return fieldInfo.hasPayloads();
+      return allowPayloads && fieldInfo.hasPayloads();
     }
   }
@@ -628,15 +628,12 @@
         throw new IllegalArgumentException("liveDocs must be null");
       }
       if (maxAllowed.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
-        System.out.println("no: max");
         return null;
       }
       if ((flags & DocsAndPositionsEnum.FLAG_OFFSETS) != 0 && maxAllowed.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0) {
-        System.out.println("no: offsets");
         return null;
       }
       if ((flags & DocsAndPositionsEnum.FLAG_PAYLOADS) != 0 && allowPayloads == false) {
-        System.out.println("no: payloads");
         return null;
       }
       return getSeedPostings(current.getKey().utf8ToString(), current.getValue(), false, maxAllowed, allowPayloads);
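Note: the hasPayloads() fix matters because pull-based consumers now trust Terms.hasPayloads() when deciding whether to request FLAG_PAYLOADS, as in the AssertingPostingsFormat change above; when the test suppresses payloads via allowPayloads, the advertised capability must agree with what the enums actually return. A hypothetical consumer-side use (PayloadFlags is illustrative only):

import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Terms;

final class PayloadFlags {
  // If Terms.hasPayloads() overstated (the field has payloads but the test
  // filtered them out), a consumer would request payloads that the
  // positions enum can never return.
  static int positionsFlags(Terms terms) {
    return terms.hasPayloads() ? DocsAndPositionsEnum.FLAG_PAYLOADS : 0;
  }
}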