Index: lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java (working copy)
+++ lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java (working copy)
@@ -17,12 +17,15 @@
* limitations under the License.
*/
+import java.io.Closeable;
import java.io.IOException;
-import java.io.Closeable;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
/**
* Extension of {@link PostingsConsumer} to support pluggable term dictionaries.
@@ -39,7 +42,7 @@
// TODO: find a better name; this defines the API that the
// terms dict impls use to talk to a postings impl.
// TermsDict + PostingsReader/WriterBase == PostingsConsumer/Producer
-public abstract class PostingsWriterBase extends PostingsConsumer implements Closeable {
+public abstract class PostingsWriterBase implements Closeable {
/** Sole constructor. (For invocation by subclass
* constructors, typically implicit.) */
@@ -51,19 +54,17 @@
* the provided {@code termsOut}. */
public abstract void init(IndexOutput termsOut) throws IOException;
- /** Return a newly created empty TermState */
- public abstract BlockTermState newTermState() throws IOException;
+ /** Write all postings for one term; use the provided
+ * {@link TermsEnum} to pull a {@link DocsEnum} or {@link
+ * DocsAndPositionsEnum}. This method should not
+ * re-position the {@code TermsEnum}! It is already
+ * positioned on the term that should be written. This
+ * method must set the bit in the provided {@link
+ * FixedBitSet} for every docID written. If no docs
+ * were written, this method should return null, and the
+ * terms dict will skip the term. */
+ public abstract BlockTermState writeTerm(BytesRef term, TermsEnum termsEnum, FixedBitSet docsSeen) throws IOException;
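+
+  // A terms dict typically drives this method like so (sketch; docsSeen
+  // is a FixedBitSet sized to the segment's maxDoc):
+  //
+  //   BlockTermState state = postingsWriter.writeTerm(term, termsEnum, docsSeen);
+  //   if (state != null) {
+  //     // term had at least one doc: record its stats and metadata
+  //   }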
- /** Start a new term. Note that a matching call to {@link
- * #finishTerm(BlockTermState)} is done, only if the term has at least one
- * document. */
- public abstract void startTerm() throws IOException;
-
- /** Finishes the current term. The provided {@link
- * BlockTermState} contains the term's summary statistics,
- * and will holds metadata from PBF when returned */
- public abstract void finishTerm(BlockTermState state) throws IOException;
-
/**
* Encode metadata as long[] and byte[]. {@code absolute} controls whether
* current term is delta encoded according to latest term.
Index: lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java (working copy)
+++ lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java (working copy)
@@ -17,6 +17,7 @@
* limitations under the License.
*/
+import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
@@ -24,13 +25,17 @@
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Builder;
@@ -174,7 +179,7 @@
* @lucene.experimental
*/
-public class BlockTreeTermsWriter extends PushFieldsConsumer {
+public class BlockTreeTermsWriter extends FieldsConsumer implements Closeable {
/** Suggested default value for the {@code
* minItemsInBlock} parameter to {@link
@@ -228,12 +233,12 @@
private final IndexOutput out;
private final IndexOutput indexOut;
+ final int maxDoc;
final int minItemsInBlock;
final int maxItemsInBlock;
final PostingsWriterBase postingsWriter;
final FieldInfos fieldInfos;
- FieldInfo currentField;
private static class FieldMetaData {
public final FieldInfo fieldInfo;
@@ -273,7 +278,6 @@
int maxItemsInBlock)
throws IOException
{
- super(state);
if (minItemsInBlock <= 1) {
throw new IllegalArgumentException("minItemsInBlock must be >= 2; got " + minItemsInBlock);
}
@@ -287,6 +291,8 @@
throw new IllegalArgumentException("maxItemsInBlock must be at least 2*(minItemsInBlock-1); got maxItemsInBlock=" + maxItemsInBlock + " minItemsInBlock=" + minItemsInBlock);
}
+ maxDoc = state.segmentInfo.getDocCount();
+
final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION);
out = state.directory.createOutput(termsFileName, state.context);
boolean success = false;
@@ -303,7 +309,6 @@
indexOut = state.directory.createOutput(termsIndexFileName, state.context);
writeIndexHeader(indexOut);
- currentField = null;
this.postingsWriter = postingsWriter;
// segment = state.segmentName;
@@ -338,16 +343,46 @@
private void writeIndexTrailer(IndexOutput indexOut, long dirStart) throws IOException {
indexOut.writeLong(dirStart);
}
-
+
@Override
- public TermsConsumer addField(FieldInfo field) throws IOException {
- //DEBUG = field.name.equals("id");
- //if (DEBUG) System.out.println("\nBTTW.addField seg=" + segment + " field=" + field.name);
- assert currentField == null || currentField.name.compareTo(field.name) < 0;
- currentField = field;
- return new TermsWriter(field);
+ public void write(Fields fields) throws IOException {
+
+ boolean success = false;
+ try {
+ String lastField = null;
+ for(String field : fields) {
+ assert lastField == null || lastField.compareTo(field) < 0;
+ lastField = field;
+
+ Terms terms = fields.terms(field);
+ if (terms == null) {
+ continue;
+ }
+
+ TermsEnum termsEnum = terms.iterator(null);
+
+ TermsWriter termsWriter = new TermsWriter(fieldInfos.fieldInfo(field));
+
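+        // Pull each term in sorted order from the enum, and hand it,
+        // still positioned, to the postings writer: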
+ while (true) {
+ BytesRef term = termsEnum.next();
+ if (term == null) {
+ break;
+ }
+ termsWriter.write(term, termsEnum);
+ }
+
+ termsWriter.finish();
+ }
+ success = true;
+ } finally {
+ if (success) {
+ IOUtils.close(this);
+ } else {
+ IOUtils.closeWhileHandlingException(this);
+ }
+ }
}
-
+
static long encodeOutput(long fp, boolean hasTerms, boolean isFloor) {
assert fp < (1L << 62);
return (fp << 2) | (hasTerms ? OUTPUT_FLAG_HAS_TERMS : 0) | (isFloor ? OUTPUT_FLAG_IS_FLOOR : 0);
@@ -488,13 +523,13 @@
final RAMOutputStream scratchBytes = new RAMOutputStream();
- class TermsWriter extends TermsConsumer {
+ class TermsWriter {
private final FieldInfo fieldInfo;
private final int longsSize;
private long numTerms;
+ final FixedBitSet docsSeen;
long sumTotalTermFreq;
long sumDocFreq;
- int docCount;
long indexStartFP;
// Used only to partition terms into the block tree; we
@@ -1000,6 +1035,7 @@
TermsWriter(FieldInfo fieldInfo) {
this.fieldInfo = fieldInfo;
+ docsSeen = new FixedBitSet(maxDoc);
noOutputs = NoOutputs.getSingleton();
@@ -1017,42 +1053,27 @@
this.longsSize = postingsWriter.setField(fieldInfo);
}
- @Override
- public PostingsConsumer startTerm(BytesRef text) throws IOException {
- //if (DEBUG) System.out.println("\nBTTW.startTerm term=" + fieldInfo.name + ":" + toString(text) + " seg=" + segment);
- postingsWriter.startTerm();
- /*
- if (fieldInfo.name.equals("id")) {
- postingsWriter.termID = Integer.parseInt(text.utf8ToString());
- } else {
- postingsWriter.termID = -1;
- }
- */
- return postingsWriter;
- }
-
private final IntsRef scratchIntsRef = new IntsRef();
- @Override
- public void finishTerm(BytesRef text, TermStats stats) throws IOException {
+ /** Writes one term's worth of postings. */
+ public void write(BytesRef text, TermsEnum termsEnum) throws IOException {
- assert stats.docFreq > 0;
- //if (DEBUG) System.out.println("BTTW.finishTerm term=" + fieldInfo.name + ":" + toString(text) + " seg=" + segment + " df=" + stats.docFreq);
+ BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen);
+ if (state != null) {
+ assert state.docFreq != 0;
+ assert fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY || state.totalTermFreq >= state.docFreq: "postingsWriter=" + postingsWriter;
+ sumDocFreq += state.docFreq;
+ sumTotalTermFreq += state.totalTermFreq;
+ blockBuilder.add(Util.toIntsRef(text, scratchIntsRef), noOutputs.getNoOutput());
- blockBuilder.add(Util.toIntsRef(text, scratchIntsRef), noOutputs.getNoOutput());
- BlockTermState state = postingsWriter.newTermState();
- state.docFreq = stats.docFreq;
- state.totalTermFreq = stats.totalTermFreq;
- postingsWriter.finishTerm(state);
-
- PendingTerm term = new PendingTerm(BytesRef.deepCopyOf(text), state);
- pending.add(term);
- numTerms++;
+ PendingTerm term = new PendingTerm(BytesRef.deepCopyOf(text), state);
+ pending.add(term);
+ numTerms++;
+ }
}
// Finishes all terms in this field
- @Override
- public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
+ public void finish() throws IOException {
if (numTerms > 0) {
blockBuilder.finish();
@@ -1062,10 +1083,6 @@
assert root.prefix.length == 0;
assert root.index.getEmptyOutput() != null;
- this.sumTotalTermFreq = sumTotalTermFreq;
- this.sumDocFreq = sumDocFreq;
- this.docCount = docCount;
-
// Write FST to index
indexStartFP = indexOut.getFilePointer();
root.index.save(indexOut);
@@ -1085,12 +1102,12 @@
indexStartFP,
sumTotalTermFreq,
sumDocFreq,
- docCount,
+ docsSeen.cardinality(),
longsSize));
} else {
assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY && sumTotalTermFreq == -1;
assert sumDocFreq == 0;
- assert docCount == 0;
+ assert docsSeen.cardinality() == 0;
}
}
Index: lucene/core/src/java/org/apache/lucene/codecs/PushFieldsConsumer.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/codecs/PushFieldsConsumer.java (revision 1530744)
+++ lucene/core/src/java/org/apache/lucene/codecs/PushFieldsConsumer.java (working copy)
@@ -48,6 +48,7 @@
*
* @lucene.experimental
*/
+// nocommit remove?
public abstract class PushFieldsConsumer extends FieldsConsumer implements Closeable {
final SegmentWriteState writeState;
Index: lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java (revision 0)
+++ lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java (working copy)
@@ -0,0 +1,212 @@
+package org.apache.lucene.codecs;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
+
+/**
+ * Extension of {@link PostingsWriterBase}, adding a push API for writing each element of the postings stream.
+ *
+ * This class contains additional hooks to interact with the provided
+ * term dictionaries such as {@link BlockTreeTermsWriter}. If you want
+ * to re-use an existing implementation and are only interested in
+ * customizing the format of the postings list, extend this class
+ * instead.
+ *
+ * @see PostingsReaderBase
+ * @lucene.experimental
+ */
+// TODO: find a better name; this defines the API that the
+// terms dict impls use to talk to a postings impl.
+// TermsDict + PostingsReader/WriterBase == PostingsConsumer/Producer
+public abstract class PushPostingsWriterBase extends PostingsWriterBase {
+
+ // Reused in writeTerm
+ private DocsEnum docsEnum;
+ private DocsAndPositionsEnum posEnum;
+ private int enumFlags;
+
+ protected FieldInfo fieldInfo;
+ protected IndexOptions indexOptions;
+ protected boolean writeFreqs;
+ protected boolean writePositions;
+ protected boolean writePayloads;
+ protected boolean writeOffsets;
+
+ /** Sole constructor. (For invocation by subclass
+ * constructors, typically implicit.) */
+ protected PushPostingsWriterBase() {
+ }
+
+ /** Called once after startup, before any terms have been
+ * added. Implementations typically write a header to
+ * the provided {@code termsOut}. */
+ public abstract void init(IndexOutput termsOut) throws IOException;
+
+ /** Return a newly created empty TermState */
+ public abstract BlockTermState newTermState() throws IOException;
+
+ /** Start a new term. Note that a matching call to {@link
+   * #finishTerm(BlockTermState)} is made only if the term has at least one
+ * document. */
+ public abstract void startTerm() throws IOException;
+
+ /** Finishes the current term. The provided {@link
+ * BlockTermState} contains the term's summary statistics,
+   * and will hold metadata from the postings base format (PBF) when returned. */
+ public abstract void finishTerm(BlockTermState state) throws IOException;
+
+ /**
+ * Encode metadata as long[] and byte[]. {@code absolute} controls whether
+ * current term is delta encoded according to latest term.
+ * Usually elements in {@code longs} are file pointers, so each one always
+ * increases when a new term is consumed. {@code out} is used to write generic
+ * bytes, which are not monotonic.
+ *
+ * NOTE: sometimes long[] might contain "don't care" values that are unused, e.g.
+ * the pointer to postings list may not be defined for some terms but is defined
+ * for others, if it is designed to inline some postings data in term dictionary.
+ * In this case, the postings writer should always use the last value, so that each
+ * element in metadata long[] remains monotonic.
+ */
+ public abstract void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException;
+
+ /**
+ * Sets the current field for writing, and returns the
+ * fixed length of long[] metadata (which is fixed per
+ * field), called when the writing switches to another field. */
+ // TODO: better name?
+ public int setField(FieldInfo fieldInfo) {
+ this.fieldInfo = fieldInfo;
+ indexOptions = fieldInfo.getIndexOptions();
+
+ writeFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+ writePositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+ writeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+ writePayloads = fieldInfo.hasPayloads();
+
+ if (writeFreqs == false) {
+ enumFlags = 0;
+ } else if (writePositions == false) {
+ enumFlags = DocsEnum.FLAG_FREQS;
+ } else if (writeOffsets == false) {
+ if (writePayloads) {
+ enumFlags = DocsAndPositionsEnum.FLAG_PAYLOADS;
+ } else {
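+        // positions themselves are always available from
+        // docsAndPositions(); the flags only add payloads/offsets: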
+ enumFlags = 0;
+ }
+ } else {
+ if (writePayloads) {
+ enumFlags = DocsAndPositionsEnum.FLAG_PAYLOADS | DocsAndPositionsEnum.FLAG_OFFSETS;
+ } else {
+ enumFlags = DocsAndPositionsEnum.FLAG_OFFSETS;
+ }
+ }
+
+ // nocommit awkward
+ return 0;
+ }
+
+ @Override
+ public final BlockTermState writeTerm(BytesRef term, TermsEnum termsEnum, FixedBitSet docsSeen) throws IOException {
+ startTerm();
+ if (writePositions == false) {
+ docsEnum = termsEnum.docs(null, docsEnum, enumFlags);
+ } else {
+ posEnum = termsEnum.docsAndPositions(null, posEnum, enumFlags);
+ docsEnum = posEnum;
+ }
+ assert docsEnum != null;
+
+ int docFreq = 0;
+ long totalTermFreq = 0;
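+
+    // Iterate all docs for this term, pushing each one through the
+    // subclass's startDoc/addPosition/finishDoc methods: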
+ while (true) {
+ int docID = docsEnum.nextDoc();
+ if (docID == DocsEnum.NO_MORE_DOCS) {
+ break;
+ }
+ docFreq++;
+ docsSeen.set(docID);
+ int freq;
+ if (writeFreqs) {
+ freq = docsEnum.freq();
+ totalTermFreq += freq;
+ } else {
+ // nocommit 1?
+ freq = -1;
+ }
+ startDoc(docID, freq);
+
+ if (writePositions) {
+        for(int i=0;i<freq;i++) {
+          int pos = posEnum.nextPosition();
+          BytesRef payload = writePayloads ? posEnum.getPayload() : null;
+          int startOffset;
+          int endOffset;
+          if (writeOffsets) {
+            startOffset = posEnum.startOffset();
+            endOffset = posEnum.endOffset();
+          } else {
+            startOffset = -1;
+            endOffset = -1;
+          }
+          addPosition(pos, payload, startOffset, endOffset);
+        }
+      }
+
+      finishDoc();
+    }
+
+    if (docFreq == 0) {
+      return null;
+    } else {
+      BlockTermState state = newTermState();
+      state.docFreq = docFreq;
+      state.totalTermFreq = writeFreqs ? totalTermFreq : -1;
+      finishTerm(state);
+      return state;
+    }
+  }
+
+  /** Adds a new doc in this term.  <code>freq</code> will be -1 when term frequencies are omitted
+ * for the field. */
+ public abstract void startDoc(int docID, int freq) throws IOException;
+
+ /** Add a new position & payload, and start/end offset. A
+ * null payload means no payload; a non-null payload with
+ * zero length also means no payload. Caller may reuse
+ * the {@link BytesRef} for the payload between calls
+ * (method must fully consume the payload). startOffset
+ * and endOffset will be -1 when offsets are not indexed. */
+ public abstract void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException;
+
+ /** Called when we are done adding positions & payloads
+ * for each doc. */
+ public abstract void finishDoc() throws IOException;
+}
Property changes on: lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java (revision 1530744)
+++ lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java (working copy)
@@ -26,7 +26,6 @@
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
-import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.DocsEnum; // javadocs
import org.apache.lucene.index.FieldInfo.IndexOptions; // javadocs
import org.apache.lucene.index.FieldInfos; // javadocs
Index: lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java (working copy)
+++ lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java (working copy)
@@ -17,32 +17,28 @@
* limitations under the License.
*/
-import static org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat.BLOCK_SIZE;
-import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_DATA_SIZE;
-import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_ENCODED_SIZE;
-
import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
-import org.apache.lucene.codecs.PostingsWriterBase;
+import org.apache.lucene.codecs.PushPostingsWriterBase;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
-import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.TermState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;
+import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_DATA_SIZE;
+import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_ENCODED_SIZE;
+import static org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat.BLOCK_SIZE;
+
/**
* Concrete class that writes docId(maybe frq,pos,offset,payloads) list
* with postings format.
@@ -52,7 +48,7 @@
* @see Lucene41SkipWriter for details about skipping setting and postings layout.
* @lucene.experimental
*/
-public final class Lucene41PostingsWriter extends PostingsWriterBase {
+public final class Lucene41PostingsWriter extends PushPostingsWriterBase {
/**
* Expert: The maximum number of skip levels. Smaller values result in
@@ -77,12 +73,6 @@
final static IntBlockTermState emptyState = new IntBlockTermState();
IntBlockTermState lastState;
- // How current field indexes postings:
- private boolean fieldHasFreqs;
- private boolean fieldHasPositions;
- private boolean fieldHasOffsets;
- private boolean fieldHasPayloads;
-
// Holds starting file pointers for current term:
private long docStartFP;
private long posStartFP;
@@ -241,15 +231,11 @@
@Override
public int setField(FieldInfo fieldInfo) {
- IndexOptions indexOptions = fieldInfo.getIndexOptions();
- fieldHasFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
- fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
- fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
- fieldHasPayloads = fieldInfo.hasPayloads();
- skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads);
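+    // the superclass derives writeFreqs/writePositions/writeOffsets/
+    // writePayloads from this field's IndexOptions: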
+ super.setField(fieldInfo);
+ skipWriter.setField(writePositions, writeOffsets, writePayloads);
lastState = emptyState;
- if (fieldHasPositions) {
- if (fieldHasPayloads || fieldHasOffsets) {
+ if (writePositions) {
+ if (writePayloads || writeOffsets) {
return 3; // doc + pos + pay FP
} else {
return 2; // doc + pos FP
@@ -262,9 +248,9 @@
@Override
public void startTerm() {
docStartFP = docOut.getFilePointer();
- if (fieldHasPositions) {
+ if (writePositions) {
posStartFP = posOut.getFilePointer();
- if (fieldHasPayloads || fieldHasOffsets) {
+ if (writePayloads || writeOffsets) {
payStartFP = payOut.getFilePointer();
}
}
@@ -301,7 +287,7 @@
// if (DEBUG) {
// System.out.println(" docDeltaBuffer[" + docBufferUpto + "]=" + docDelta);
// }
- if (fieldHasFreqs) {
+ if (writeFreqs) {
freqBuffer[docBufferUpto] = termDocFreq;
}
docBufferUpto++;
@@ -312,7 +298,7 @@
// System.out.println(" write docDelta block @ fp=" + docOut.getFilePointer());
// }
forUtil.writeBlock(docDeltaBuffer, encoded, docOut);
- if (fieldHasFreqs) {
+ if (writeFreqs) {
// if (DEBUG) {
// System.out.println(" write freq block @ fp=" + docOut.getFilePointer());
// }
@@ -329,14 +315,13 @@
lastStartOffset = 0;
}
- /** Add a new position & payload */
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
// if (DEBUG) {
- // System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (fieldHasPayloads ? " payloadByteUpto=" + payloadByteUpto: ""));
+ // System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (writePayloads ? " payloadByteUpto=" + payloadByteUpto: ""));
// }
posDeltaBuffer[posBufferUpto] = position - lastPosition;
- if (fieldHasPayloads) {
+ if (writePayloads) {
if (payload == null || payload.length == 0) {
// no payload
payloadLengthBuffer[posBufferUpto] = 0;
@@ -350,7 +335,7 @@
}
}
- if (fieldHasOffsets) {
+ if (writeOffsets) {
assert startOffset >= lastStartOffset;
assert endOffset >= startOffset;
offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastStartOffset;
@@ -366,13 +351,13 @@
// }
forUtil.writeBlock(posDeltaBuffer, encoded, posOut);
- if (fieldHasPayloads) {
+ if (writePayloads) {
forUtil.writeBlock(payloadLengthBuffer, encoded, payOut);
payOut.writeVInt(payloadByteUpto);
payOut.writeBytes(payloadBytes, 0, payloadByteUpto);
payloadByteUpto = 0;
}
- if (fieldHasOffsets) {
+ if (writeOffsets) {
forUtil.writeBlock(offsetStartDeltaBuffer, encoded, payOut);
forUtil.writeBlock(offsetLengthBuffer, encoded, payOut);
}
@@ -433,7 +418,7 @@
     // if (DEBUG) {
     //   if (posBufferUpto > 0) {
- // System.out.println(" write pos vInt block (count=" + posBufferUpto + ") at fp=" + posOut.getFilePointer() + " posStartFP=" + posStartFP + " hasPayloads=" + fieldHasPayloads + " hasOffsets=" + fieldHasOffsets);
+ // System.out.println(" write pos vInt block (count=" + posBufferUpto + ") at fp=" + posOut.getFilePointer() + " posStartFP=" + posStartFP + " hasPayloads=" + writePayloads + " hasOffsets=" + writeOffsets);
// }
// }
@@ -474,7 +459,7 @@
int payloadBytesReadUpto = 0;
     for(int i=0;i<posBufferUpto;i++) {
Index: lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java
===================================================================
--- lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java	(working copy)
+++ lucene/codecs/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsWriter.java	(working copy)
+  @Override
+  public BlockTermState writeTerm(BytesRef term, TermsEnum termsEnum, FixedBitSet docsSeen) throws IOException {
+    // First pass: count positions (or docs, when positions are
+    // omitted) to decide whether this term should be pulsed:
+    int posCount = 0;
-      if (payload != null && payload.length > 0) {
- if (pos.payload == null) {
- pos.payload = BytesRef.deepCopyOf(payload);
- } else {
- pos.payload.copyBytes(payload);
+ posEnum = termsEnum.docsAndPositions(null, posEnum, enumFlags);
+ assert posEnum != null;
+ while (posCount <= maxPositions) {
+ if (posEnum.nextDoc() == DocsEnum.NO_MORE_DOCS) {
+ break;
}
- } else if (pos.payload != null) {
- pos.payload.length = 0;
+ posCount += posEnum.freq();
}
}
- }
- @Override
- public void finishDoc() throws IOException {
- // if (DEBUG) System.out.println("PW finishDoc");
- if (pendingCount == -1) {
- wrappedPostingsWriter.finishDoc();
+ if (posCount == 0) {
+ // All docs were deleted
+ return null;
}
- }
- private final RAMOutputStream buffer = new RAMOutputStream();
+ // Second pass: write postings
+ if (posCount > maxPositions) {
+      // Too many positions; do not pulse.  Just let the
+ // wrapped postingsWriter encode the postings:
- // private int baseDocID;
+ PulsingTermState state = new PulsingTermState();
+ state.wrappedState = wrappedPostingsWriter.writeTerm(term, termsEnum, docsSeen);
+ state.docFreq = state.wrappedState.docFreq;
+ state.totalTermFreq = state.wrappedState.totalTermFreq;
+ return state;
+ } else {
+ // Pulsed:
+ if (fieldHasPositions == false) {
+ docsEnum = termsEnum.docs(null, docsEnum, enumFlags);
+ } else {
+ posEnum = termsEnum.docsAndPositions(null, posEnum, enumFlags);
+ docsEnum = posEnum;
+ }
+ assert docsEnum != null;
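+
+      // Buffer this term's postings into RAM; the bytes will be
+      // inlined into the terms dict instead of the postings files: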
- /** Called when we are done adding docs to this term */
- @Override
- public void finishTerm(BlockTermState _state) throws IOException {
- PulsingTermState state = (PulsingTermState) _state;
-
- // if (DEBUG) System.out.println("PW finishTerm docCount=" + stats.docFreq + " pendingCount=" + pendingCount + " pendingTerms.size()=" + pendingTerms.size());
-
- assert pendingCount > 0 || pendingCount == -1;
-
- if (pendingCount == -1) {
- state.wrappedState.docFreq = state.docFreq;
- state.wrappedState.totalTermFreq = state.totalTermFreq;
- state.bytes = null;
- wrappedPostingsWriter.finishTerm(state.wrappedState);
- } else {
// There were few enough total occurrences for this
// term, so we fully inline our postings data into
// terms dict, now:
@@ -287,98 +202,138 @@
// given codec wants to store other interesting
// stuff, it could use this pulsing codec to do so
- if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
- int lastDocID = 0;
- int pendingIDX = 0;
- int lastPayloadLength = -1;
- int lastOffsetLength = -1;
- while(pendingIDX < pendingCount) {
- final Position doc = pending[pendingIDX];
+ int lastDocID = 0;
+ int lastPayloadLength = -1;
+ int lastOffsetLength = -1;
- final int delta = doc.docID - lastDocID;
- lastDocID = doc.docID;
+ int docFreq = 0;
+ long totalTermFreq = 0;
+ while (true) {
+ int docID = docsEnum.nextDoc();
+ if (docID == DocsEnum.NO_MORE_DOCS) {
+ break;
+ }
+ docsSeen.set(docID);
- // if (DEBUG) System.out.println(" write doc=" + doc.docID + " freq=" + doc.termFreq);
+ int delta = docID - lastDocID;
+ lastDocID = docID;
- if (doc.termFreq == 1) {
- buffer.writeVInt((delta<<1)|1);
+ docFreq++;
+
+ if (fieldHasFreqs) {
+ int freq = docsEnum.freq();
+ totalTermFreq += freq;
+
+ if (freq == 1) {
+ buffer.writeVInt((delta << 1) | 1);
} else {
- buffer.writeVInt(delta<<1);
- buffer.writeVInt(doc.termFreq);
+ buffer.writeVInt(delta << 1);
+ buffer.writeVInt(freq);
}
- int lastPos = 0;
- int lastOffset = 0;
-        for(int posIDX=0;posIDX<doc.termFreq;posIDX++) {
+        if (fieldHasPositions) {
+          int lastPos = 0;
+          int lastOffset = 0;
+          for(int posIDX=0;posIDX<freq;posIDX++) {
+            int pos = posEnum.nextPosition();
+            int posDelta = pos - lastPos;
+            lastPos = pos;
+            int payloadLength;
+            BytesRef payload;
+            if (fieldHasPayloads) {
+              payload = posEnum.getPayload();
+              payloadLength = payload == null ? 0 : payload.length;
+              if (payloadLength != lastPayloadLength) {
+                buffer.writeVInt((posDelta<<1)|1);
+                buffer.writeVInt(payloadLength);
+                lastPayloadLength = payloadLength;
+              } else {
+                buffer.writeVInt(posDelta<<1);
+              }
+            } else {
+              payloadLength = 0;
+              payload = null;
+              buffer.writeVInt(posDelta);
+            }
+            if (fieldHasOffsets) {
+              int startOffset = posEnum.startOffset();
+              int endOffset = posEnum.endOffset();
+              int offsetDelta = startOffset - lastOffset;
+              int offsetLength = endOffset - startOffset;
+              if (offsetLength != lastOffsetLength) {
+                buffer.writeVInt(offsetDelta << 1 | 1);
+                buffer.writeVInt(offsetLength);
+              } else {
+                buffer.writeVInt(offsetDelta << 1);
+              }
+              lastOffset = startOffset;
+              lastOffsetLength = offsetLength;
+            }
+            if (payloadLength > 0) {
+ assert fieldHasPayloads;
+ assert payload != null;
+ buffer.writeBytes(payload.bytes, payload.offset, payload.length);
}
- lastOffset = pos.startOffset;
- lastOffsetLength = offsetLength;
}
-
- if (payloadLength > 0) {
- assert storePayloads;
- buffer.writeBytes(pos.payload.bytes, 0, pos.payload.length);
- }
}
+ } else {
+ buffer.writeVInt(delta);
}
- } else if (indexOptions == IndexOptions.DOCS_AND_FREQS) {
- int lastDocID = 0;
-      for(int posIDX=0;posIDX<pendingCount;posIDX++) {
Index: lucene/codecs/src/java/org/apache/lucene/codecs/sep/SepPostingsWriter.java
===================================================================
--- lucene/codecs/src/java/org/apache/lucene/codecs/sep/SepPostingsWriter.java	(working copy)
+++ lucene/codecs/src/java/org/apache/lucene/codecs/sep/SepPostingsWriter.java	(working copy)
     if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
throw new UnsupportedOperationException("this codec cannot index offsets");
}
skipListWriter.setIndexOptions(indexOptions);
- storePayloads = indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS && fieldInfo.hasPayloads();
lastPayloadFP = 0;
lastSkipFP = 0;
lastState = setEmptyState();
@@ -233,7 +228,7 @@
// TODO: -- awkward we have to make these two
// separate calls to skipper
//System.out.println(" buffer skip lastDocID=" + lastDocID);
- skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
+ skipListWriter.setSkipData(lastDocID, writePayloads, lastPayloadLength);
skipListWriter.bufferSkip(df);
}
@@ -254,7 +249,7 @@
assert delta >= 0: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it)
lastPosition = position;
- if (storePayloads) {
+ if (writePayloads) {
final int payloadLength = payload == null ? 0 : payload.length;
if (payloadLength != lastPayloadLength) {
lastPayloadLength = payloadLength;
@@ -344,7 +339,7 @@
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
lastState.posIndex.copyFrom(state.posIndex, false);
lastState.posIndex.write(out, absolute);
- if (storePayloads) {
+ if (writePayloads) {
if (absolute) {
out.writeVLong(state.payloadFP);
} else {
Index: lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java
===================================================================
--- lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java (revision 1530744)
+++ lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java (working copy)
@@ -17,26 +17,29 @@
* limitations under the License.
*/
+import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
-import org.apache.lucene.codecs.PostingsConsumer;
+import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.codecs.PushFieldsConsumer;
import org.apache.lucene.codecs.TermStats;
-import org.apache.lucene.codecs.BlockTermState;
-import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
@@ -52,7 +55,7 @@
* @lucene.experimental
*/
-public class BlockTermsWriter extends PushFieldsConsumer {
+public class BlockTermsWriter extends FieldsConsumer implements Closeable {
final static String CODEC_NAME = "BLOCK_TERMS_DICT";
@@ -70,6 +73,7 @@
final FieldInfos fieldInfos;
FieldInfo currentField;
private final TermsIndexWriterBase termsIndexWriter;
+ private final int maxDoc;
private static class FieldMetaData {
public final FieldInfo fieldInfo;
@@ -99,9 +103,9 @@
public BlockTermsWriter(TermsIndexWriterBase termsIndexWriter,
SegmentWriteState state, PostingsWriterBase postingsWriter)
throws IOException {
- super(state);
final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION);
this.termsIndexWriter = termsIndexWriter;
+ maxDoc = state.segmentInfo.getDocCount();
out = state.directory.createOutput(termsFileName, state.context);
boolean success = false;
try {
@@ -127,7 +131,43 @@
}
@Override
- public TermsConsumer addField(FieldInfo field) throws IOException {
+ public void write(Fields fields) throws IOException {
+
+ boolean success = false;
+ try {
+ for(String field : fields) {
+
+ Terms terms = fields.terms(field);
+ if (terms == null) {
+ continue;
+ }
+
+ TermsEnum termsEnum = terms.iterator(null);
+
+ TermsWriter termsWriter = addField(fieldInfos.fieldInfo(field));
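+      // addField asserts that fields arrive in ascending name order: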
+
+ while (true) {
+ BytesRef term = termsEnum.next();
+ if (term == null) {
+ break;
+ }
+
+ termsWriter.write(term, termsEnum);
+ }
+
+ termsWriter.finish();
+ }
+ success = true;
+ } finally {
+ if (success) {
+ IOUtils.close(this);
+ } else {
+ IOUtils.closeWhileHandlingException(this);
+ }
+ }
+ }
+
+ private TermsWriter addField(FieldInfo field) throws IOException {
//System.out.println("\nBTW.addField seg=" + segment + " field=" + field.name);
assert currentField == null || currentField.name.compareTo(field.name) < 0;
currentField = field;
@@ -135,7 +175,6 @@
return new TermsWriter(fieldIndexWriter, field, postingsWriter);
}
- @Override
public void close() throws IOException {
try {
final long dirStart = out.getFilePointer();
@@ -169,12 +208,13 @@
public BlockTermState state;
}
- class TermsWriter extends TermsConsumer {
+ class TermsWriter {
private final FieldInfo fieldInfo;
private final PostingsWriterBase postingsWriter;
private final long termsStartPointer;
private long numTerms;
private final TermsIndexWriterBase.FieldWriter fieldIndexWriter;
+ private final FixedBitSet docsSeen;
long sumTotalTermFreq;
long sumDocFreq;
int docCount;
@@ -191,6 +231,7 @@
{
this.fieldInfo = fieldInfo;
this.fieldIndexWriter = fieldIndexWriter;
+ this.docsSeen = new FixedBitSet(maxDoc);
pendingTerms = new TermEntry[32];
       for(int i=0;i<pendingTerms.length;i++) {
Index: lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java
===================================================================
--- lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java	(revision 1530744)
+++ lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java	(working copy)
+  final int maxDoc;
   final List<FieldMetaData> fields = new ArrayList<FieldMetaData>();
IndexOutput blockOut = null;
IndexOutput indexOut = null;
public FSTOrdTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException {
- super(state);
final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
final String termsBlockFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_BLOCK_EXTENSION);
this.postingsWriter = postingsWriter;
this.fieldInfos = state.fieldInfos;
+ this.maxDoc = state.segmentInfo.getDocCount();
boolean success = false;
try {
@@ -180,11 +182,41 @@
}
@Override
- public TermsConsumer addField(FieldInfo field) throws IOException {
- return new TermsWriter(field);
+ public void write(Fields fields) throws IOException {
+ try {
+ for(String field : fields) {
+ Terms terms = fields.terms(field);
+ if (terms == null) {
+ continue;
+ }
+ FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+ boolean hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+ TermsEnum termsEnum = terms.iterator(null);
+ TermsWriter termsWriter = new TermsWriter(fieldInfo);
+
+ long sumTotalTermFreq = 0;
+ long sumDocFreq = 0;
+ FixedBitSet docsSeen = new FixedBitSet(maxDoc);
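+
+        // Field stats are now accumulated here, in the terms dict;
+        // docCount is derived from docsSeen's cardinality below: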
+ while (true) {
+ BytesRef term = termsEnum.next();
+ if (term == null) {
+ break;
+ }
+ BlockTermState termState = postingsWriter.writeTerm(term, termsEnum, docsSeen);
+ if (termState != null) {
+ termsWriter.finishTerm(term, termState);
+ sumTotalTermFreq += termState.totalTermFreq;
+ sumDocFreq += termState.docFreq;
+ }
+ }
+
+ termsWriter.finish(hasFreq ? sumTotalTermFreq : -1, sumDocFreq, docsSeen.cardinality());
+ }
+ } finally {
+ close();
+ }
}
- @Override
public void close() throws IOException {
IOException ioe = null;
try {
@@ -247,7 +279,7 @@
public RAMOutputStream metaBytesOut;
}
- final class TermsWriter extends TermsConsumer {
+ final class TermsWriter {
    private final Builder<Long> builder;
private final PositiveIntOutputs outputs;
private final FieldInfo fieldInfo;
@@ -284,34 +316,23 @@
this.lastMetaBytesFP = 0;
}
- @Override
- public PostingsConsumer startTerm(BytesRef text) throws IOException {
- postingsWriter.startTerm();
- return postingsWriter;
- }
-
- @Override
- public void finishTerm(BytesRef text, TermStats stats) throws IOException {
+ public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
if (numTerms > 0 && numTerms % SKIP_INTERVAL == 0) {
bufferSkip();
}
// write term meta data into fst
final long longs[] = new long[longsSize];
- final long delta = stats.totalTermFreq - stats.docFreq;
- if (stats.totalTermFreq > 0) {
+ final long delta = state.totalTermFreq - state.docFreq;
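+      // the low bit of docFreq records whether totalTermFreq == docFreq,
+      // in which case the delta is not written: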
+ if (state.totalTermFreq > 0) {
if (delta == 0) {
- statsOut.writeVInt(stats.docFreq<<1|1);
+ statsOut.writeVInt(state.docFreq<<1|1);
} else {
- statsOut.writeVInt(stats.docFreq<<1|0);
- statsOut.writeVLong(stats.totalTermFreq-stats.docFreq);
+ statsOut.writeVInt(state.docFreq<<1|0);
+ statsOut.writeVLong(state.totalTermFreq-state.docFreq);
}
} else {
- statsOut.writeVInt(stats.docFreq);
+ statsOut.writeVInt(state.docFreq);
}
- BlockTermState state = postingsWriter.newTermState();
- state.docFreq = stats.docFreq;
- state.totalTermFreq = stats.totalTermFreq;
- postingsWriter.finishTerm(state);
postingsWriter.encodeTerm(longs, metaBytesOut, fieldInfo, state, true);
for (int i = 0; i < longsSize; i++) {
metaLongsOut.writeVLong(longs[i] - lastLongs[i]);
@@ -325,7 +346,6 @@
lastMetaBytesFP = metaBytesOut.getFilePointer();
}
- @Override
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
if (numTerms > 0) {
final FieldMetaData metadata = new FieldMetaData();
Index: lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermOutputs.java
===================================================================
--- lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermOutputs.java (revision 1530744)
+++ lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermOutputs.java (working copy)
@@ -25,7 +25,6 @@
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.fst.Outputs;
-import org.apache.lucene.util.LongsRef;
/**
* An FST {@link Outputs} implementation for
@@ -89,6 +88,11 @@
}
@Override
+ public String toString() {
+ return "FSTTermOutputs$TermData longs=" + Arrays.toString(longs) + " bytes=" + Arrays.toString(bytes) + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq;
+ }
+
+ @Override
public boolean equals(Object other_) {
if (other_ == this) {
return true;
@@ -221,6 +225,7 @@
@Override
public void write(TermData data, DataOutput out) throws IOException {
+ assert hasPos || data.totalTermFreq == -1;
int bit0 = allZero(data.longs) ? 0 : 1;
int bit1 = ((data.bytes == null || data.bytes.length == 0) ? 0 : 1) << 1;
int bit2 = ((data.docFreq == 0) ? 0 : 1) << 2;
Index: lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java
===================================================================
--- lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java (revision 1530744)
+++ lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java (working copy)
@@ -25,9 +25,8 @@
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
+import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter;
-import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader;
-import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
Index: lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java
===================================================================
--- lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java (revision 1530744)
+++ lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java (working copy)
@@ -25,9 +25,8 @@
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
+import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter;
-import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader;
-import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
Index: lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPulsing41PostingsFormat.java
===================================================================
--- lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPulsing41PostingsFormat.java (revision 1530744)
+++ lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPulsing41PostingsFormat.java (working copy)
@@ -25,12 +25,9 @@
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter;
-import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsBaseFormat;
-import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat;
+import org.apache.lucene.codecs.pulsing.PulsingPostingsReader;
import org.apache.lucene.codecs.pulsing.PulsingPostingsWriter;
-import org.apache.lucene.codecs.pulsing.PulsingPostingsReader;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
Index: lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java
===================================================================
--- lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java (revision 1530744)
+++ lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java (working copy)
@@ -23,20 +23,21 @@
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
-import org.apache.lucene.codecs.PostingsConsumer;
+import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.codecs.PushFieldsConsumer;
-import org.apache.lucene.codecs.TermStats;
-import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Builder;
@@ -119,7 +120,7 @@
* @lucene.experimental
*/
-public class FSTTermsWriter extends PushFieldsConsumer {
+public class FSTTermsWriter extends FieldsConsumer {
static final String TERMS_EXTENSION = "tmp";
static final String TERMS_CODEC_NAME = "FST_TERMS_DICT";
public static final int TERMS_VERSION_START = 0;
@@ -128,15 +129,16 @@
final PostingsWriterBase postingsWriter;
final FieldInfos fieldInfos;
final IndexOutput out;
+ final int maxDoc;
+  final List<FieldMetaData> fields = new ArrayList<FieldMetaData>();
public FSTTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException {
- super(state);
final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION);
this.postingsWriter = postingsWriter;
this.fieldInfos = state.fieldInfos;
this.out = state.directory.createOutput(termsFileName, state.context);
+ this.maxDoc = state.segmentInfo.getDocCount();
boolean success = false;
try {
@@ -149,19 +151,53 @@
}
}
}
+
private void writeHeader(IndexOutput out) throws IOException {
CodecUtil.writeHeader(out, TERMS_CODEC_NAME, TERMS_VERSION_CURRENT);
}
+
private void writeTrailer(IndexOutput out, long dirStart) throws IOException {
out.writeLong(dirStart);
}
@Override
- public TermsConsumer addField(FieldInfo field) throws IOException {
- return new TermsWriter(field);
+ public void write(Fields fields) throws IOException {
+ try {
+ for(String field : fields) {
+ Terms terms = fields.terms(field);
+ if (terms == null) {
+ continue;
+ }
+ FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+ boolean hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+ TermsEnum termsEnum = terms.iterator(null);
+      TermsWriter termsWriter = new TermsWriter(fieldInfo);
+
+ long sumTotalTermFreq = 0;
+ long sumDocFreq = 0;
+ FixedBitSet docsSeen = new FixedBitSet(maxDoc);
+
+ while (true) {
+ BytesRef term = termsEnum.next();
+ if (term == null) {
+ break;
+ }
+
+ BlockTermState termState = postingsWriter.writeTerm(term, termsEnum, docsSeen);
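+        // a null state means this term wrote no docs (e.g. all were
+        // deleted), so the terms dict skips it: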
+ if (termState != null) {
+ termsWriter.finishTerm(term, termState);
+ sumTotalTermFreq += termState.totalTermFreq;
+ sumDocFreq += termState.docFreq;
+ }
+ }
+
+ termsWriter.finish(hasFreq ? sumTotalTermFreq : -1, sumDocFreq, docsSeen.cardinality());
+ }
+ } finally {
+ close();
+ }
}
- @Override
public void close() throws IOException {
IOException ioe = null;
try {
@@ -208,7 +244,7 @@
}
}
- final class TermsWriter extends TermsConsumer {
+ final class TermsWriter {
    private final Builder<FSTTermOutputs.TermData> builder;
private final FSTTermOutputs outputs;
private final FieldInfo fieldInfo;
@@ -226,22 +262,13 @@
      this.builder = new Builder<FSTTermOutputs.TermData>(FST.INPUT_TYPE.BYTE1, outputs);
}
- @Override
- public PostingsConsumer startTerm(BytesRef text) throws IOException {
- postingsWriter.startTerm();
- return postingsWriter;
- }
-
- @Override
- public void finishTerm(BytesRef text, TermStats stats) throws IOException {
+ public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
// write term meta data into fst
- final BlockTermState state = postingsWriter.newTermState();
final FSTTermOutputs.TermData meta = new FSTTermOutputs.TermData();
meta.longs = new long[longsSize];
meta.bytes = null;
- meta.docFreq = state.docFreq = stats.docFreq;
- meta.totalTermFreq = state.totalTermFreq = stats.totalTermFreq;
- postingsWriter.finishTerm(state);
+ meta.docFreq = state.docFreq;
+ meta.totalTermFreq = state.totalTermFreq;
postingsWriter.encodeTerm(meta.longs, metaWriter, fieldInfo, state, true);
final int bytesSize = (int)metaWriter.getFilePointer();
if (bytesSize > 0) {
@@ -253,7 +280,6 @@
numTerms++;
}
- @Override
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
// save FST dict
if (numTerms > 0) {
Index: lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java
===================================================================
--- lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java (revision 1530744)
+++ lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java (working copy)
@@ -28,15 +28,12 @@
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
-import org.apache.lucene.codecs.PostingsConsumer;
import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.PushFieldsConsumer;
-import org.apache.lucene.codecs.TermStats;
-import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.codecs.bloom.FuzzySet.ContainsResult;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
@@ -118,9 +115,7 @@
* "blm" file. This PostingsFormat delegates to a choice of delegate
* PostingsFormat for encoding all other postings data. This choice of
* constructor defaults to the {@link DefaultBloomFilterFactory} for
- * configuring per-field BloomFilters. Note that the
- * wrapped PostingsFormat must use a {@link PushFieldsConsumer}
- * for writing.
+ * configuring per-field BloomFilters.
*
* @param delegatePostingsFormat
* The PostingsFormat that records all the non-bloom filter data i.e.
@@ -144,11 +139,7 @@
+ " has been constructed without a choice of PostingsFormat");
}
FieldsConsumer fieldsConsumer = delegatePostingsFormat.fieldsConsumer(state);
- if (!(fieldsConsumer instanceof PushFieldsConsumer)) {
- throw new UnsupportedOperationException("Wrapped PostingsFormat must return a PushFieldsConsumer");
- }
- return new BloomFilteredFieldsConsumer(
- (PushFieldsConsumer) fieldsConsumer, state);
+ return new BloomFilteredFieldsConsumer(fieldsConsumer, state);
}
@Override
@@ -310,7 +301,7 @@
this.delegateTermsEnum = null;
}
- private final TermsEnum delegate() throws IOException {
+ private TermsEnum delegate() throws IOException {
if (delegateTermsEnum == null) {
/* pull the iterator only if we really need it -
* this can be a relativly heavy operation depending on the
@@ -322,12 +313,12 @@
}
@Override
- public final BytesRef next() throws IOException {
+ public BytesRef next() throws IOException {
return delegate().next();
}
@Override
- public final boolean seekExact(BytesRef text)
+ public boolean seekExact(BytesRef text)
throws IOException {
// The magical fail-fast speed up that is the entire point of all of
// this code - save a disk seek if there is a match on an in-memory
@@ -341,33 +332,33 @@
}
@Override
- public final SeekStatus seekCeil(BytesRef text)
+ public SeekStatus seekCeil(BytesRef text)
throws IOException {
return delegate().seekCeil(text);
}
@Override
- public final void seekExact(long ord) throws IOException {
+ public void seekExact(long ord) throws IOException {
delegate().seekExact(ord);
}
@Override
- public final BytesRef term() throws IOException {
+ public BytesRef term() throws IOException {
return delegate().term();
}
@Override
- public final long ord() throws IOException {
+ public long ord() throws IOException {
return delegate().ord();
}
@Override
- public final int docFreq() throws IOException {
+ public int docFreq() throws IOException {
return delegate().docFreq();
}
@Override
- public final long totalTermFreq() throws IOException {
+ public long totalTermFreq() throws IOException {
return delegate().totalTermFreq();
}
@@ -396,35 +387,60 @@
}
}
- class BloomFilteredFieldsConsumer extends PushFieldsConsumer {
- private PushFieldsConsumer delegateFieldsConsumer;
+ class BloomFilteredFieldsConsumer extends FieldsConsumer {
+ private FieldsConsumer delegateFieldsConsumer;
    private Map<FieldInfo,FuzzySet> bloomFilters = new HashMap<FieldInfo,FuzzySet>();
private SegmentWriteState state;
- public BloomFilteredFieldsConsumer(PushFieldsConsumer fieldsConsumer,
+ public BloomFilteredFieldsConsumer(FieldsConsumer fieldsConsumer,
SegmentWriteState state) {
- super(state);
this.delegateFieldsConsumer = fieldsConsumer;
this.state = state;
}
-
+
@Override
- public TermsConsumer addField(FieldInfo field) throws IOException {
- FuzzySet bloomFilter = bloomFilterFactory.getSetForField(state,field);
- if (bloomFilter != null) {
- assert bloomFilters.containsKey(field) == false;
- bloomFilters.put(field, bloomFilter);
- return new WrappedTermsConsumer(delegateFieldsConsumer.addField(field), bloomFilter);
- } else {
- // No, use the unfiltered fieldsConsumer - we are not interested in
- // recording any term Bitsets.
- return delegateFieldsConsumer.addField(field);
+ public void write(Fields fields) throws IOException {
+ try {
+ for(String field : fields) {
+ Terms terms = fields.terms(field);
+ if (terms == null) {
+ continue;
+ }
+ FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
+ TermsEnum termsEnum = terms.iterator(null);
+
+ FuzzySet bloomFilter = null;
+
+ DocsEnum docsEnum = null;
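+
+        // the bloom filter is created lazily on the first term, so
+        // fields that are not bloom'd allocate nothing: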
+ while (true) {
+ BytesRef term = termsEnum.next();
+ if (term == null) {
+ break;
+ }
+ if (bloomFilter == null) {
+ bloomFilter = bloomFilterFactory.getSetForField(state, fieldInfo);
+ if (bloomFilter == null) {
+ // Field not bloom'd
+ break;
+ }
+          assert bloomFilters.containsKey(fieldInfo) == false;
+ bloomFilters.put(fieldInfo, bloomFilter);
+ }
+ // Make sure there's at least one doc for this term:
+ docsEnum = termsEnum.docs(null, docsEnum, 0);
+ if (docsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS) {
+ bloomFilter.addValue(term);
+ }
+ }
+ }
+ } finally {
+ close();
}
+
+ delegateFieldsConsumer.write(fields);
}
-
- @Override
+
public void close() throws IOException {
- delegateFieldsConsumer.close();
// Now we are done accumulating values for these fields
      List<Entry<FieldInfo,FuzzySet>> nonSaturatedBlooms = new ArrayList<Entry<FieldInfo,FuzzySet>>();
@@ -470,37 +486,5 @@
}
rightSizedSet.serialize(bloomOutput);
}
-
}
-
- class WrappedTermsConsumer extends TermsConsumer {
- private TermsConsumer delegateTermsConsumer;
- private FuzzySet bloomFilter;
-
- public WrappedTermsConsumer(TermsConsumer termsConsumer,FuzzySet bloomFilter) {
- this.delegateTermsConsumer = termsConsumer;
- this.bloomFilter = bloomFilter;
- }
-
- @Override
- public PostingsConsumer startTerm(BytesRef text) throws IOException {
- return delegateTermsConsumer.startTerm(text);
- }
-
- @Override
- public void finishTerm(BytesRef text, TermStats stats) throws IOException {
-
- // Record this term in our BloomFilter
- if (stats.docFreq > 0) {
- bloomFilter.addValue(text);
- }
- delegateTermsConsumer.finishTerm(text, stats);
- }
-
- @Override
- public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount)
- throws IOException {
- delegateTermsConsumer.finish(sumTotalTermFreq, sumDocFreq, docCount);
- }
- }
}
Index: lucene/test-framework/src/java/org/apache/lucene/codecs/mockintblock/MockFixedIntBlockPostingsFormat.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/codecs/mockintblock/MockFixedIntBlockPostingsFormat.java (revision 1530744)
+++ lucene/test-framework/src/java/org/apache/lucene/codecs/mockintblock/MockFixedIntBlockPostingsFormat.java (working copy)
@@ -37,8 +37,8 @@
import org.apache.lucene.codecs.sep.IntStreamFactory;
import org.apache.lucene.codecs.sep.SepPostingsReader;
import org.apache.lucene.codecs.sep.SepPostingsWriter;
+import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
-import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.store.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
Index: lucene/test-framework/src/java/org/apache/lucene/codecs/mockintblock/MockVariableIntBlockPostingsFormat.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/codecs/mockintblock/MockVariableIntBlockPostingsFormat.java (revision 1530744)
+++ lucene/test-framework/src/java/org/apache/lucene/codecs/mockintblock/MockVariableIntBlockPostingsFormat.java (working copy)
@@ -37,8 +37,8 @@
import org.apache.lucene.codecs.sep.IntStreamFactory;
import org.apache.lucene.codecs.sep.SepPostingsReader;
import org.apache.lucene.codecs.sep.SepPostingsWriter;
+import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
-import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
Index: lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40RWPostingsFormat.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40RWPostingsFormat.java (revision 1530744)
+++ lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40RWPostingsFormat.java (working copy)
@@ -1,13 +1,5 @@
package org.apache.lucene.codecs.lucene40;
-import java.io.IOException;
-
-import org.apache.lucene.codecs.BlockTreeTermsWriter;
-import org.apache.lucene.codecs.FieldsConsumer;
-import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.index.SegmentWriteState;
-import org.apache.lucene.util.LuceneTestCase;
-
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -25,6 +17,14 @@
* limitations under the License.
*/
+import java.io.IOException;
+
+import org.apache.lucene.codecs.BlockTreeTermsWriter;
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.PostingsWriterBase;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.LuceneTestCase;
+
/**
* Read-write version of {@link Lucene40PostingsFormat} for testing.
*/
Index: lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java (revision 1530744)
+++ lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java (working copy)
@@ -21,22 +21,18 @@
* index file format */
import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
-import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.codecs.TermStats;
+import org.apache.lucene.codecs.PushPostingsWriterBase;
import org.apache.lucene.index.CorruptIndexException;
-import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.DocsEnum; // javadocs
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
@@ -46,7 +42,7 @@
* @see Lucene40PostingsFormat
* @lucene.experimental
*/
-public final class Lucene40PostingsWriter extends PostingsWriterBase {
+public final class Lucene40PostingsWriter extends PushPostingsWriterBase {
final IndexOutput freqOut;
final IndexOutput proxOut;
@@ -70,13 +66,9 @@
final int maxSkipLevels = 10;
final int totalNumDocs;
- IndexOptions indexOptions;
- boolean storePayloads;
- boolean storeOffsets;
// Starts a new term
long freqStart;
long proxStart;
- FieldInfo fieldInfo;
int lastPayloadLength;
int lastOffsetLength;
int lastPosition;
@@ -150,7 +142,6 @@
return new StandardTermState();
}
-
@Override
public void startTerm() {
freqStart = freqOut.getFilePointer();
@@ -169,6 +160,7 @@
// our parent calls setField whenever the field changes
@Override
public int setField(FieldInfo fieldInfo) {
+ super.setField(fieldInfo);
//System.out.println("SPW: setField");
/*
if (BlockTreeTermsWriter.DEBUG && fieldInfo.name.equals("id")) {
@@ -177,11 +169,7 @@
DEBUG = false;
}
*/
- this.fieldInfo = fieldInfo;
- indexOptions = fieldInfo.getIndexOptions();
-
- storeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
- storePayloads = fieldInfo.hasPayloads();
+
lastState = emptyState;
//System.out.println(" set init blockFreqStart=" + freqStart);
//System.out.println(" set init blockProxStart=" + proxStart);
@@ -190,7 +178,7 @@
int lastDocID;
int df;
-
+
@Override
public void startDoc(int docID, int termDocFreq) throws IOException {
// if (DEBUG) System.out.println("SPW: startDoc seg=" + segment + " docID=" + docID + " tf=" + termDocFreq + " freqOut.fp=" + freqOut.getFilePointer());
@@ -202,7 +190,7 @@
}
if ((++df % skipInterval) == 0) {
- skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength, storeOffsets, lastOffsetLength);
+ skipListWriter.setSkipData(lastDocID, writePayloads, lastPayloadLength, writeOffsets, lastOffsetLength);
skipListWriter.bufferSkip(df);
}
@@ -237,7 +225,7 @@
int payloadLength = 0;
- if (storePayloads) {
+ if (writePayloads) {
payloadLength = payload == null ? 0 : payload.length;
if (payloadLength != lastPayloadLength) {
@@ -251,7 +239,7 @@
proxOut.writeVInt(delta);
}
- if (storeOffsets) {
+ if (writeOffsets) {
// don't use startOffset - lastEndOffset, because this creates lots of negative vints for synonyms,
// and the numbers aren't that much smaller anyways.
int offsetDelta = startOffset - lastOffset;
@@ -285,7 +273,7 @@
/** Called when we are done adding docs to this term */
@Override
public void finishTerm(BlockTermState _state) throws IOException {
- StandardTermState state = (StandardTermState)_state;
+ StandardTermState state = (StandardTermState) _state;
// if (DEBUG) System.out.println("SPW: finishTerm seg=" + segment + " freqStart=" + freqStart);
assert state.docFreq > 0;
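// Annotation (not part of the patch): the fieldInfo/storePayloads/storeOffsets
// fields deleted above now live in PushPostingsWriterBase, so subclasses call
// super.setField(fieldInfo) and read the inherited writePayloads/writeOffsets
// flags instead. An inferred sketch of the centralized bookkeeping --
// writePayloads/writeOffsets appear in the hunks above, while
// writeFreqs/writePositions and the return value are assumptions, not the
// verbatim base-class source:
public int setField(FieldInfo fieldInfo) {
  this.fieldInfo = fieldInfo;
  IndexOptions indexOptions = fieldInfo.getIndexOptions();
  writeFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
  writePositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
  writeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
  writePayloads = fieldInfo.hasPayloads();
  return 0; // subclasses report their own per-term metadata-long count
}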
Index: lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java (revision 1530744)
+++ lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java (working copy)
@@ -40,6 +40,10 @@
import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexWriter;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter;
+import org.apache.lucene.codecs.memory.FSTOrdTermsReader;
+import org.apache.lucene.codecs.memory.FSTOrdTermsWriter;
+import org.apache.lucene.codecs.memory.FSTTermsReader;
+import org.apache.lucene.codecs.memory.FSTTermsWriter;
import org.apache.lucene.codecs.mockintblock.MockFixedIntBlockPostingsFormat;
import org.apache.lucene.codecs.mockintblock.MockVariableIntBlockPostingsFormat;
import org.apache.lucene.codecs.mocksep.MockSingleIntFactory;
@@ -50,10 +54,6 @@
import org.apache.lucene.codecs.sep.IntStreamFactory;
import org.apache.lucene.codecs.sep.SepPostingsReader;
import org.apache.lucene.codecs.sep.SepPostingsWriter;
-import org.apache.lucene.codecs.memory.FSTTermsWriter;
-import org.apache.lucene.codecs.memory.FSTTermsReader;
-import org.apache.lucene.codecs.memory.FSTOrdTermsWriter;
-import org.apache.lucene.codecs.memory.FSTOrdTermsReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
Index: lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java (revision 1530744)
+++ lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java (working copy)
@@ -163,6 +163,7 @@
boolean hasFreqs = fieldInfo.getIndexOptions().compareTo(FieldInfo.IndexOptions.DOCS_AND_FREQS) >= 0;
boolean hasPositions = fieldInfo.getIndexOptions().compareTo(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
boolean hasOffsets = fieldInfo.getIndexOptions().compareTo(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+ boolean hasPayloads = terms.hasPayloads();
assert hasPositions == terms.hasPositions();
assert hasOffsets == terms.hasOffsets();
@@ -179,14 +180,16 @@
lastTerm.copyBytes(term);
}
+ int flags = 0;
if (hasPositions == false) {
- int flags = 0;
if (hasFreqs) {
flags = flags | DocsEnum.FLAG_FREQS;
}
docsEnum = termsEnum.docs(null, docsEnum, flags);
} else {
- int flags = DocsAndPositionsEnum.FLAG_PAYLOADS;
+ if (hasPayloads) {
+ flags |= DocsAndPositionsEnum.FLAG_PAYLOADS;
+ }
if (hasOffsets) {
flags = flags | DocsAndPositionsEnum.FLAG_OFFSETS;
}
@@ -194,6 +197,8 @@
docsEnum = posEnum;
}
+ assert docsEnum != null : "termsEnum=" + termsEnum + " hasPositions=" + hasPositions;
+
int lastDocID = -1;
while(true) {
@@ -212,13 +217,13 @@
int lastStartOffset = -1;
for(int i=0;i<freq;i++) {
int pos = posEnum.nextPosition();
- assert pos > lastPos;
+ assert pos >= lastPos: "pos=" + pos + " vs lastPos=" + lastPos + " i=" + i + " freq=" + freq;
lastPos = pos;
if (hasOffsets) {
int startOffset = posEnum.startOffset();
int endOffset = posEnum.endOffset();
- assert endOffset > startOffset;
+ assert endOffset >= startOffset;
assert startOffset >= lastStartOffset;
lastStartOffset = startOffset;
}
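// Annotation (not part of the patch): a consolidated view of the enum-flag
// selection after these hunks. FLAG_PAYLOADS is now requested only when the
// Terms instance actually advertises payloads, and the position/offset
// assertions relax to >= so zero-length positions and offsets are legal:
int flags = 0;
if (hasPositions == false) {
  if (hasFreqs) {
    flags |= DocsEnum.FLAG_FREQS;
  }
  docsEnum = termsEnum.docs(null, docsEnum, flags);
} else {
  if (hasPayloads) {
    flags |= DocsAndPositionsEnum.FLAG_PAYLOADS;
  }
  if (hasOffsets) {
    flags |= DocsAndPositionsEnum.FLAG_OFFSETS;
  }
  posEnum = termsEnum.docsAndPositions(null, posEnum, flags);
  docsEnum = posEnum;
}
assert docsEnum != null : "termsEnum=" + termsEnum + " hasPositions=" + hasPositions;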
Index: lucene/test-framework/src/java/org/apache/lucene/codecs/mocksep/MockSepPostingsFormat.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/codecs/mocksep/MockSepPostingsFormat.java (revision 1530744)
+++ lucene/test-framework/src/java/org/apache/lucene/codecs/mocksep/MockSepPostingsFormat.java (working copy)
@@ -32,8 +32,8 @@
import org.apache.lucene.codecs.blockterms.TermsIndexWriterBase;
import org.apache.lucene.codecs.sep.SepPostingsReader;
import org.apache.lucene.codecs.sep.SepPostingsWriter;
+import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
-import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.util.BytesRef;
/**
Index: lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java (revision 1530744)
+++ lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java (working copy)
@@ -538,7 +538,7 @@
@Override
public boolean hasPayloads() {
- return fieldInfo.hasPayloads();
+ return allowPayloads && fieldInfo.hasPayloads();
}
}
@@ -628,15 +628,12 @@
throw new IllegalArgumentException("liveDocs must be null");
}
if (maxAllowed.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
- System.out.println("no: max");
return null;
}
if ((flags & DocsAndPositionsEnum.FLAG_OFFSETS) != 0 && maxAllowed.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0) {
- System.out.println("no: offsets");
return null;
}
if ((flags & DocsAndPositionsEnum.FLAG_PAYLOADS) != 0 && allowPayloads == false) {
- System.out.println("no: payloads");
return null;
}
return getSeedPostings(current.getKey().utf8ToString(), current.getValue(), false, maxAllowed, allowPayloads);
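// Annotation (not part of the patch): hasPayloads() must agree with what
// docsAndPositions() will actually grant. An illustration with hypothetical
// values -- before the fix the two answers could disagree:
boolean allowPayloads = false;     // this test run caps postings below payloads
boolean fieldHasPayloads = true;   // what fieldInfo.hasPayloads() reports
boolean advertised = allowPayloads && fieldHasPayloads;  // fixed hasPayloads()
// A consumer trusting fieldHasPayloads alone would pass FLAG_PAYLOADS, be
// handed null by docsAndPositions(), and trip the new null assert added to
// AssertingPostingsFormat above.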