Index: lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java (revision 1516140)
+++ lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java (working copy)
@@ -29,6 +29,7 @@
 import java.util.concurrent.atomic.AtomicBoolean;

 import org.apache.lucene.analysis.*;
+import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.document.BinaryDocValuesField;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -543,7 +544,7 @@
       if (sawAppend && sawFlush) {
         break;
       }
-      if (FreqProxTermsWriterPerField.class.getName().equals(trace[i].getClassName()) && "flush".equals(trace[i].getMethodName())) {
+      if (PostingsFormat.class.getName().equals(trace[i].getClassName()) && "write".equals(trace[i].getMethodName())) {
         sawAppend = true;
       }
       if ("flush".equals(trace[i].getMethodName())) {
Index: lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java (revision 1516140)
+++ lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java (working copy)
@@ -245,6 +245,7 @@
     reader.close();

     // Reopen
+    // nocommit should we set mBD=2 again?
     writer = new IndexWriter(
         directory,
         newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).
Index: lucene/core/src/test/org/apache/lucene/index/TestLongPostings.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/index/TestLongPostings.java (revision 1516140)
+++ lucene/core/src/test/org/apache/lucene/index/TestLongPostings.java (working copy)
@@ -199,6 +199,9 @@
       }

       if (random().nextInt(6) == 3) {
+        if (VERBOSE) {
+          System.out.println(" check positions");
+        }
         final int freq = postings.freq();
         assertTrue(freq >=1 && freq <= 4);
for(int pos=0;pos= 0;
+      boolean hasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+      assert hasPositions == terms.hasPositions();
+      boolean hasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+      assert hasOffsets == terms.hasOffsets();
+
+      long sumTotalTermFreq = 0;
+      long sumDocFreq = 0;
+
+      int flags = 0;
+      if (hasPositions == false) {
+        if (hasFreq) {
+          flags |= DocsEnum.FLAG_FREQS;
+        }
+      } else {
+        flags = DocsAndPositionsEnum.FLAG_PAYLOADS;
+        if (hasOffsets) {
+          flags |= DocsAndPositionsEnum.FLAG_OFFSETS;
+        }
+      }
+
+      DocsEnum docsEnum = null;
+      DocsAndPositionsEnum docsAndPositionsEnum = null;
+      TermsEnum termsEnum = terms.iterator(null);
+
+      while (true) { // for all terms in this field
+        BytesRef term = termsEnum.next();
+        if (term == null) {
+          break;
+        }
+        if (hasPositions) {
+          docsAndPositionsEnum = termsEnum.docsAndPositions(null, docsAndPositionsEnum, flags);
+          docsEnum = docsAndPositionsEnum;
+        } else {
+          docsEnum = termsEnum.docs(null, docsEnum, flags);
+          docsAndPositionsEnum = null;
+        }
+
+        PostingsConsumer postingsConsumer = termsConsumer.startTerm(term);
+
+        // How many documents have this term:
+        int docFreq = 0;
+
+        // How many times this term occurs:
+        long totalTermFreq = 0;
+
+        while(true) { // for all docs in this field+term
+          int doc = docsEnum.nextDoc();
+          if (doc == DocsEnum.NO_MORE_DOCS) {
+            break;
+          }
+          docFreq++;
+          visitedDocs.set(doc);
+          if (hasFreq) {
+            int freq = docsEnum.freq();
+            postingsConsumer.startDoc(doc, freq);
+            totalTermFreq += freq;
+
+            if (hasPositions) {
+              for(int i=0;i 0) {
+          termsConsumer.finishTerm(term, new TermStats(docFreq, hasFreq ? totalTermFreq : -1));
+          sumTotalTermFreq += totalTermFreq;
+          sumDocFreq += docFreq;
+        }
+      }
+
+      termsConsumer.finish(hasFreq ? sumTotalTermFreq : -1, sumDocFreq, visitedDocs.cardinality());
+      }
+    }
+    success = true;
+    } finally {
+      if (success) {
+        IOUtils.close(fieldsConsumer);
+      } else {
+        IOUtils.closeWhileHandlingException(fieldsConsumer);
+      }
+    }
+  }

   /** Writes a new segment */
   public abstract FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException;
Index: lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java (revision 1516140)
+++ lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java (working copy)
@@ -70,79 +70,4 @@
   /** Called when we are done adding positions & payloads
    * for each doc. */
   public abstract void finishDoc() throws IOException;
-
-  /** Default merge impl: append documents, mapping around
-   *  deletes */
-  public TermStats merge(final MergeState mergeState, IndexOptions indexOptions, final DocsEnum postings, final FixedBitSet visitedDocs) throws IOException {
-
-    int df = 0;
-    long totTF = 0;
-
-    if (indexOptions == IndexOptions.DOCS_ONLY) {
-      while(true) {
-        final int doc = postings.nextDoc();
-        if (doc == DocIdSetIterator.NO_MORE_DOCS) {
-          break;
-        }
-        visitedDocs.set(doc);
-        this.startDoc(doc, -1);
-        this.finishDoc();
-        df++;
-      }
-      totTF = -1;
-    } else if (indexOptions == IndexOptions.DOCS_AND_FREQS) {
-      while(true) {
-        final int doc = postings.nextDoc();
-        if (doc == DocIdSetIterator.NO_MORE_DOCS) {
-          break;
-        }
-        visitedDocs.set(doc);
-        final int freq = postings.freq();
-        this.startDoc(doc, freq);
-        this.finishDoc();
-        df++;
-        totTF += freq;
-      }
-    } else if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
-      final DocsAndPositionsEnum postingsEnum = (DocsAndPositionsEnum) postings;
-      while(true) {
-        final int doc = postingsEnum.nextDoc();
-        if (doc == DocIdSetIterator.NO_MORE_DOCS) {
-          break;
-        }
-        visitedDocs.set(doc);
-        final int freq = postingsEnum.freq();
-        this.startDoc(doc, freq);
-        totTF += freq;
-        for(int i=0;i getComparator() throws IOException;
-
-  private MappingMultiDocsEnum docsEnum;
-  private MappingMultiDocsEnum docsAndFreqsEnum;
-  private MappingMultiDocsAndPositionsEnum postingsEnum;
-
-  /** Default merge impl */
-  public void merge(MergeState mergeState, IndexOptions indexOptions, TermsEnum termsEnum) throws IOException {
-
-    BytesRef term;
-    assert termsEnum != null;
-    long sumTotalTermFreq = 0;
-    long sumDocFreq = 0;
-    long sumDFsinceLastAbortCheck = 0;
-    FixedBitSet visitedDocs = new FixedBitSet(mergeState.segmentInfo.getDocCount());
-
-    if (indexOptions == IndexOptions.DOCS_ONLY) {
-      if (docsEnum == null) {
-        docsEnum = new MappingMultiDocsEnum();
-      }
-      docsEnum.setMergeState(mergeState);
-
-      MultiDocsEnum docsEnumIn = null;
-
-      while((term = termsEnum.next()) != null) {
-        // We can pass null for liveDocs, because the
-        // mapping enum will skip the non-live docs:
-        docsEnumIn = (MultiDocsEnum) termsEnum.docs(null, docsEnumIn, DocsEnum.FLAG_NONE);
-        if (docsEnumIn != null) {
-          docsEnum.reset(docsEnumIn);
-          final PostingsConsumer postingsConsumer = startTerm(term);
-          final TermStats stats = postingsConsumer.merge(mergeState, indexOptions, docsEnum,
visitedDocs); - if (stats.docFreq > 0) { - finishTerm(term, stats); - sumTotalTermFreq += stats.docFreq; - sumDFsinceLastAbortCheck += stats.docFreq; - sumDocFreq += stats.docFreq; - if (sumDFsinceLastAbortCheck > 60000) { - mergeState.checkAbort.work(sumDFsinceLastAbortCheck/5.0); - sumDFsinceLastAbortCheck = 0; - } - } - } - } - } else if (indexOptions == IndexOptions.DOCS_AND_FREQS) { - if (docsAndFreqsEnum == null) { - docsAndFreqsEnum = new MappingMultiDocsEnum(); - } - docsAndFreqsEnum.setMergeState(mergeState); - - MultiDocsEnum docsAndFreqsEnumIn = null; - - while((term = termsEnum.next()) != null) { - // We can pass null for liveDocs, because the - // mapping enum will skip the non-live docs: - docsAndFreqsEnumIn = (MultiDocsEnum) termsEnum.docs(null, docsAndFreqsEnumIn); - assert docsAndFreqsEnumIn != null; - docsAndFreqsEnum.reset(docsAndFreqsEnumIn); - final PostingsConsumer postingsConsumer = startTerm(term); - final TermStats stats = postingsConsumer.merge(mergeState, indexOptions, docsAndFreqsEnum, visitedDocs); - if (stats.docFreq > 0) { - finishTerm(term, stats); - sumTotalTermFreq += stats.totalTermFreq; - sumDFsinceLastAbortCheck += stats.docFreq; - sumDocFreq += stats.docFreq; - if (sumDFsinceLastAbortCheck > 60000) { - mergeState.checkAbort.work(sumDFsinceLastAbortCheck/5.0); - sumDFsinceLastAbortCheck = 0; - } - } - } - } else if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { - if (postingsEnum == null) { - postingsEnum = new MappingMultiDocsAndPositionsEnum(); - } - postingsEnum.setMergeState(mergeState); - MultiDocsAndPositionsEnum postingsEnumIn = null; - while((term = termsEnum.next()) != null) { - // We can pass null for liveDocs, because the - // mapping enum will skip the non-live docs: - postingsEnumIn = (MultiDocsAndPositionsEnum) termsEnum.docsAndPositions(null, postingsEnumIn, DocsAndPositionsEnum.FLAG_PAYLOADS); - assert postingsEnumIn != null; - postingsEnum.reset(postingsEnumIn); - - final PostingsConsumer postingsConsumer = startTerm(term); - final TermStats stats = postingsConsumer.merge(mergeState, indexOptions, postingsEnum, visitedDocs); - if (stats.docFreq > 0) { - finishTerm(term, stats); - sumTotalTermFreq += stats.totalTermFreq; - sumDFsinceLastAbortCheck += stats.docFreq; - sumDocFreq += stats.docFreq; - if (sumDFsinceLastAbortCheck > 60000) { - mergeState.checkAbort.work(sumDFsinceLastAbortCheck/5.0); - sumDFsinceLastAbortCheck = 0; - } - } - } - } else { - assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; - if (postingsEnum == null) { - postingsEnum = new MappingMultiDocsAndPositionsEnum(); - } - postingsEnum.setMergeState(mergeState); - MultiDocsAndPositionsEnum postingsEnumIn = null; - while((term = termsEnum.next()) != null) { - // We can pass null for liveDocs, because the - // mapping enum will skip the non-live docs: - postingsEnumIn = (MultiDocsAndPositionsEnum) termsEnum.docsAndPositions(null, postingsEnumIn); - assert postingsEnumIn != null; - postingsEnum.reset(postingsEnumIn); - - final PostingsConsumer postingsConsumer = startTerm(term); - final TermStats stats = postingsConsumer.merge(mergeState, indexOptions, postingsEnum, visitedDocs); - if (stats.docFreq > 0) { - finishTerm(term, stats); - sumTotalTermFreq += stats.totalTermFreq; - sumDFsinceLastAbortCheck += stats.docFreq; - sumDocFreq += stats.docFreq; - if (sumDFsinceLastAbortCheck > 60000) { - mergeState.checkAbort.work(sumDFsinceLastAbortCheck/5.0); - sumDFsinceLastAbortCheck = 0; - } - } - } - } - finish(indexOptions == 
IndexOptions.DOCS_ONLY ? -1 : sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality()); - } } Index: lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java =================================================================== --- lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java (revision 1516140) +++ lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java (working copy) @@ -108,7 +108,7 @@ /** Collapse the hash table & sort in-place. */ public int[] sortPostings(Comparator termComp) { - return bytesHash.sort(termComp); + return bytesHash.sort(termComp); } private boolean doCall; @@ -136,7 +136,8 @@ // Secondary entry point (for 2nd & subsequent TermsHash), // because token text has already been "interned" into - // textStart, so we hash by textStart + // textStart, so we hash by textStart. term vectors use + // this API. public void add(int textStart) throws IOException { int termID = bytesHash.addByPoolOffset(textStart); if (termID >= 0) { // New posting @@ -173,7 +174,8 @@ } } - // Primary entry point (for first TermsHash) + // Primary entry point (for first TermsHash); postings use + // this API. @Override void add() throws IOException { Index: lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java =================================================================== --- lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java (revision 1516140) +++ lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java (working copy) @@ -22,9 +22,10 @@ import java.util.List; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.FieldInfosWriter; import org.apache.lucene.codecs.FieldsConsumer; -import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.codecs.MappedMultiFields; import org.apache.lucene.codecs.StoredFieldsWriter; import org.apache.lucene.codecs.TermVectorsWriter; import org.apache.lucene.index.FieldInfo.DocValuesType; @@ -375,7 +376,15 @@ docBase += maxDoc; } - final FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(segmentWriteState); + //final FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(segmentWriteState); + + Fields mergedFields = new MappedMultiFields(mergeState, + new MultiFields(fields.toArray(Fields.EMPTY_ARRAY), + slices.toArray(ReaderSlice.EMPTY_ARRAY))); + + codec.postingsFormat().write(mergedFields, segmentWriteState); + + /* boolean success = false; try { consumer.merge(mergeState, @@ -389,5 +398,6 @@ IOUtils.closeWhileHandlingException(consumer); } } + */ } } Index: lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java =================================================================== --- lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java (working copy) @@ -0,0 +1,549 @@ +package org.apache.lucene.index; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Comparator; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.FreqProxTermsWriterPerField.FreqProxPostingsArray; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; + +/** Implements {@link Fields} interface over the in-RAM + * buffered fields/terms/postings, to flush postings + * through the PostingsFormat. */ + +// nocommit should we factor out an "only the iterables" +// part of Fields/Terms/*Enums? ie, instead of throwing exc +// from all stats? we can't really support the stats +// because they'd require an extra pass during merging +// ... so this is really just "iterables" + +class FreqProxFields extends Fields { + final Map fields = new LinkedHashMap(); + + public FreqProxFields(List fieldList) { + // NOTE: fields are already sorted by field name + for(FreqProxTermsWriterPerField field : fieldList) { + fields.put(field.fieldInfo.name, field); + } + } + + public Iterator iterator() { + return fields.keySet().iterator(); + } + + @Override + public Terms terms(String field) throws IOException { + FreqProxTermsWriterPerField perField = fields.get(field); + return perField == null ? null : new FreqProxTerms(perField); + } + + @Override + public int size() { + return fields.size(); + } + + private static class FreqProxTerms extends Terms { + final FreqProxTermsWriterPerField terms; + + public FreqProxTerms(FreqProxTermsWriterPerField terms) { + this.terms = terms; + } + + @Override + public TermsEnum iterator(TermsEnum reuse) { + FreqProxTermsEnum termsEnum; + if (reuse instanceof FreqProxTermsEnum && ((FreqProxTermsEnum) reuse).terms == this.terms) { + termsEnum = (FreqProxTermsEnum) reuse; + } else { + termsEnum = new FreqProxTermsEnum(terms); + } + termsEnum.reset(); + return termsEnum; + } + + // nocommit in old TermsConsumer API, consumer could + // override comparator ... but I think we just nuke/lose + // this. + + @Override + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + @Override + public long size() { + // nocommit throw exc instead? merging cannot do this: + return terms.termsHashPerField.bytesHash.size(); + } + + @Override + public long getSumTotalTermFreq() { + // nocommit throw exc instead? merging cannot do this: + return terms.sumTotalTermFreq; + } + + @Override + public long getSumDocFreq() { + // nocommit throw exc instead? merging cannot do this: + return terms.sumDocFreq; + } + + @Override + public int getDocCount() { + // nocommit throw exc instead? 
merging cannot do this: + return terms.docCount; + } + + @Override + public boolean hasOffsets() { + // NOTE: the in-memory buffer may have indexed offsets + // because that's what FieldInfo said when we started, + // but during indexing this may have been downgraded: + return terms.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + } + + @Override + public boolean hasPositions() { + // NOTE: the in-memory buffer may have indexed positions + // because that's what FieldInfo said when we started, + // but during indexing this may have been downgraded: + return terms.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + } + + @Override + public boolean hasPayloads() { + return terms.hasPayloads; + } + } + + private static class FreqProxTermsEnum extends TermsEnum { + final FreqProxTermsWriterPerField terms; + final int[] sortedTermIDs; + final FreqProxPostingsArray postingsArray; + final BytesRef scratch = new BytesRef(); + BytesRef currentTerm; + final int numTerms; + int ord; + + public FreqProxTermsEnum(FreqProxTermsWriterPerField terms) { + this.terms = terms; + this.numTerms = terms.termsHashPerField.bytesHash.size(); + sortedTermIDs = terms.sortedTermIDs; + assert sortedTermIDs != null; + postingsArray = (FreqProxPostingsArray) terms.termsHashPerField.postingsArray; + } + + @Override + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + public void reset() { + ord = -1; + } + + // nocommit impl seekExact(BytesRef)? + + public SeekStatus seekCeil(BytesRef text) { + + // nocommit we could also ... keep the BytesRefHash + // intact so this is a hash lookup + + // binary search: + int lo = 0; + int hi = numTerms - 1; + while (hi >= lo) { + int mid = (lo + hi) >>> 1; + int textStart = postingsArray.textStarts[sortedTermIDs[mid]]; + terms.termsHashPerField.bytePool.setBytesRef(scratch, textStart); + int cmp = scratch.compareTo(text); + if (cmp < 0) { + lo = mid + 1; + } else if (cmp > 0) { + hi = mid - 1; + } else { + // found: + ord = mid; + currentTerm = scratch; + return SeekStatus.FOUND; + } + } + + // not found: + ord = lo + 1; + if (ord == numTerms) { + currentTerm = null; + return SeekStatus.END; + } else { + // nocommit do i need to do this again? loop may + // not end on this term? + //int textStart = postingsArray.textStarts[sortedTermIDs[mid]]; + //termsHashPerField.bytePool.setBytesRef(scratch, textStart); + currentTerm = scratch; + return SeekStatus.NOT_FOUND; + } + } + + public void seekExact(long ord) { + this.ord = (int) ord; + int textStart = postingsArray.textStarts[sortedTermIDs[this.ord]]; + terms.termsHashPerField.bytePool.setBytesRef(scratch, textStart); + currentTerm = scratch; + } + + // nocommit seekExact(term, termState)? + + @Override + public BytesRef next() { + ord++; + if (ord >= numTerms) { + currentTerm = null; + } else { + int textStart = postingsArray.textStarts[sortedTermIDs[ord]]; + terms.termsHashPerField.bytePool.setBytesRef(scratch, textStart); + currentTerm = scratch; + } + return currentTerm; + } + + @Override + public BytesRef term() { + // nocommit can we just return scratch? 
+ return currentTerm; + } + + @Override + public long ord() { + return ord; + } + + @Override + public int docFreq() { + // We do not store this per-term, and we cannot + // implement this at merge time w/o an added pass + // through the postings: + throw new UnsupportedOperationException(); + } + + @Override + public long totalTermFreq() { + // We do not store this per-term, and we cannot + // implement this at merge time w/o an added pass + // through the postings: + throw new UnsupportedOperationException(); + } + + @Override + public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) { + FreqProxDocsEnum docsEnum; + + if (!terms.hasFreq && (flags & DocsEnum.FLAG_FREQS) != 0) { + // Caller wants freqs but we didn't index them; + // don't lie: + throw new IllegalArgumentException("did not index freq"); + } + + if (reuse instanceof FreqProxDocsEnum) { + docsEnum = (FreqProxDocsEnum) reuse; + if (docsEnum.postingsArray != postingsArray) { + docsEnum = new FreqProxDocsEnum(terms, postingsArray); + } + } else { + docsEnum = new FreqProxDocsEnum(terms, postingsArray); + } + docsEnum.reset(sortedTermIDs[ord]); + return docsEnum; + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) { + FreqProxDocsAndPositionsEnum posEnum; + + if (!terms.hasProx) { + // Caller wants positions but we didn't index them; + // don't lie: + throw new IllegalArgumentException("did not index positions"); + } + + if (!terms.hasOffsets && (flags & DocsAndPositionsEnum.FLAG_OFFSETS) != 0) { + // Caller wants offsets but we didn't index them; + // don't lie: + throw new IllegalArgumentException("did not index offsets"); + } + + if (reuse instanceof FreqProxDocsAndPositionsEnum) { + posEnum = (FreqProxDocsAndPositionsEnum) reuse; + if (posEnum.postingsArray != postingsArray) { + posEnum = new FreqProxDocsAndPositionsEnum(terms, postingsArray); + } + } else { + posEnum = new FreqProxDocsAndPositionsEnum(terms, postingsArray); + } + posEnum.reset(sortedTermIDs[ord]); + return posEnum; + } + + /** + * Expert: Returns the TermsEnums internal state to position the TermsEnum + * without re-seeking the term dictionary. + *

+ * NOTE: A seek by {@link TermState} might not capture the + * {@link AttributeSource}'s state. Callers must maintain the + * {@link AttributeSource} states separately + * + * @see TermState + * @see #seekExact(BytesRef, TermState) + */ + public TermState termState() throws IOException { + return new TermState() { + @Override + public void copyFrom(TermState other) { + throw new UnsupportedOperationException(); + } + }; + } + } + + private static class FreqProxDocsEnum extends DocsEnum { + + final FreqProxTermsWriterPerField terms; + final FreqProxPostingsArray postingsArray; + final ByteSliceReader reader = new ByteSliceReader(); + final boolean readTermFreq; + int docID; + int freq; + boolean ended; + int termID; + + public FreqProxDocsEnum(FreqProxTermsWriterPerField terms, FreqProxPostingsArray postingsArray) { + this.terms = terms; + this.postingsArray = postingsArray; + this.readTermFreq = terms.hasFreq; + } + + public void reset(int termID) { + this.termID = termID; + terms.termsHashPerField.initReader(reader, termID, 0); + ended = false; + docID = 0; + } + + @Override + public int docID() { + return docID; + } + + @Override + public int freq() { + // Don't lie here ... don't want codecs writings lots + // of wasted 1s into the index: + if (!readTermFreq) { + throw new IllegalStateException("freq was not indexed"); + } else { + return freq; + } + } + + @Override + public int nextDoc() throws IOException { + if (reader.eof()) { + if (ended) { + return NO_MORE_DOCS; + } else { + ended = true; + docID = postingsArray.lastDocIDs[termID]; + if (readTermFreq) { + freq = postingsArray.termFreqs[termID]; + } + } + } else { + int code = reader.readVInt(); + if (!readTermFreq) { + docID += code; + } else { + docID += code >>> 1; + if ((code & 1) != 0) { + freq = 1; + } else { + freq = reader.readVInt(); + } + } + + assert docID != postingsArray.lastDocIDs[termID]; + } + + return docID; + } + + @Override + public int advance(int target) { + throw new UnsupportedOperationException(); + } + + @Override + public long cost() { + throw new UnsupportedOperationException(); + } + } + + private static class FreqProxDocsAndPositionsEnum extends DocsAndPositionsEnum { + + final FreqProxTermsWriterPerField terms; + final FreqProxPostingsArray postingsArray; + final ByteSliceReader reader = new ByteSliceReader(); + final ByteSliceReader posReader = new ByteSliceReader(); + final boolean readOffsets; + int docID; + int freq; + int pos; + int startOffset; + int endOffset; + int posLeft; + int termID; + boolean ended; + boolean hasPayload; + BytesRef payload = new BytesRef(); + + public FreqProxDocsAndPositionsEnum(FreqProxTermsWriterPerField terms, FreqProxPostingsArray postingsArray) { + this.terms = terms; + this.postingsArray = postingsArray; + this.readOffsets = terms.hasOffsets; + assert terms.hasProx; + assert terms.hasFreq; + } + + public void reset(int termID) { + this.termID = termID; + terms.termsHashPerField.initReader(reader, termID, 0); + terms.termsHashPerField.initReader(posReader, termID, 1); + ended = false; + docID = 0; + posLeft = 0; + } + + @Override + public int docID() { + return docID; + } + + @Override + public int freq() { + return freq; + } + + @Override + public int nextDoc() throws IOException { + while (posLeft-- != 0) { + nextPosition(); + } + + if (reader.eof()) { + if (ended) { + return NO_MORE_DOCS; + } else { + ended = true; + docID = postingsArray.lastDocIDs[termID]; + freq = postingsArray.termFreqs[termID]; + } + } else { + int code = reader.readVInt(); + docID += code 
>>> 1; + if ((code & 1) != 0) { + freq = 1; + } else { + freq = reader.readVInt(); + } + + assert docID != postingsArray.lastDocIDs[termID]; + } + + posLeft = freq; + pos = 0; + startOffset = 0; + return docID; + } + + @Override + public int advance(int target) { + throw new UnsupportedOperationException(); + } + + @Override + public long cost() { + throw new UnsupportedOperationException(); + } + + @Override + public int nextPosition() throws IOException { + assert posLeft > 0; + posLeft--; + int code = posReader.readVInt(); + pos += code >>> 1; + if ((code & 1) != 0) { + hasPayload = true; + // has a payload + payload.length = posReader.readVInt(); + if (payload.bytes.length < payload.length) { + payload.grow(payload.length); + } + posReader.readBytes(payload.bytes, 0, payload.length); + } else { + hasPayload = false; + } + + if (readOffsets) { + startOffset += posReader.readVInt(); + endOffset = startOffset + posReader.readVInt(); + } + + return pos; + } + + @Override + public int startOffset() { + if (!readOffsets) { + throw new IllegalStateException("offsets were not indexed"); + } + return startOffset; + } + + @Override + public int endOffset() { + if (!readOffsets) { + throw new IllegalStateException("offsets were not indexed"); + } + return endOffset; + } + + @Override + public BytesRef getPayload() { + if (hasPayload) { + return payload; + } else { + return null; + } + } + } +} Property changes on: lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java =================================================================== --- lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java (revision 1516140) +++ lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java (working copy) @@ -19,6 +19,8 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; import java.util.List; import java.util.Map; @@ -32,6 +34,50 @@ @Override void abort() {} + private void applyDeletes(SegmentWriteState state, Fields fields) throws IOException { + // Process any pending Term deletes for this newly + // flushed segment: + if (state.segDeletes != null && state.segDeletes.terms.size() > 0) { + Map segDeletes = state.segDeletes.terms; + List deleteTerms = new ArrayList(segDeletes.keySet()); + Collections.sort(deleteTerms); + String lastField = null; + TermsEnum termsEnum = null; + DocsEnum docsEnum = null; + for(Term deleteTerm : deleteTerms) { + if (deleteTerm.field().equals(lastField) == false) { + lastField = deleteTerm.field(); + Terms terms = fields.terms(lastField); + if (terms != null) { + termsEnum = terms.iterator(termsEnum); + } + } + + if (termsEnum != null && termsEnum.seekExact(deleteTerm.bytes())) { + docsEnum = termsEnum.docs(null, docsEnum, 0); + int delDocLimit = segDeletes.get(deleteTerm); + while (true) { + int doc = docsEnum.nextDoc(); + if (doc == DocsEnum.NO_MORE_DOCS) { + break; + } + if (doc < delDocLimit) { + if (state.liveDocs == null) { + state.liveDocs = state.segmentInfo.getCodec().liveDocsFormat().newLiveDocs(state.segmentInfo.getDocCount()); + } + if (state.liveDocs.get(doc)) { + state.delCountOnFlush++; + state.liveDocs.clear(doc); + } + } else { + break; + } + } + } + } + } + } + // TODO: would be nice to factor out more of this, eg the // FreqProxFieldMergeState, 
and code to visit all Fields // under the same FieldInfo together, up into TermsHash*. @@ -44,9 +90,13 @@ // ThreadStates List allFields = new ArrayList(); + Comparator termComp = BytesRef.getUTF8SortedAsUnicodeComparator(); + for (TermsHashConsumerPerField f : fieldsToFlush.values()) { final FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField) f; if (perField.termsHashPerField.bytesHash.size() > 0) { + perField.sortPostings(termComp); + assert perField.fieldInfo.isIndexed(); allFields.add(perField); } } @@ -56,53 +106,30 @@ // Sort by field name CollectionUtil.introSort(allFields); - final FieldsConsumer consumer = state.segmentInfo.getCodec().postingsFormat().fieldsConsumer(state); + Fields fields = new FreqProxFields(allFields); - boolean success = false; + applyDeletes(state, fields); - try { - TermsHash termsHash = null; + state.segmentInfo.getCodec().postingsFormat().write(fields, state); + + TermsHash termsHash = null; - /* - Current writer chain: - FieldsConsumer - -> IMPL: FormatPostingsTermsDictWriter - -> TermsConsumer - -> IMPL: FormatPostingsTermsDictWriter.TermsWriter - -> DocsConsumer - -> IMPL: FormatPostingsDocsWriter - -> PositionsConsumer - -> IMPL: FormatPostingsPositionsWriter - */ - - for (int fieldNumber = 0; fieldNumber < numAllFields; fieldNumber++) { - final FieldInfo fieldInfo = allFields.get(fieldNumber).fieldInfo; + for (int fieldNumber = 0; fieldNumber < numAllFields; fieldNumber++) { + final FieldInfo fieldInfo = allFields.get(fieldNumber).fieldInfo; - final FreqProxTermsWriterPerField fieldWriter = allFields.get(fieldNumber); + final FreqProxTermsWriterPerField fieldWriter = allFields.get(fieldNumber); - // If this field has postings then add them to the - // segment - fieldWriter.flush(fieldInfo.name, consumer, state); - - TermsHashPerField perField = fieldWriter.termsHashPerField; - assert termsHash == null || termsHash == perField.termsHash; - termsHash = perField.termsHash; - int numPostings = perField.bytesHash.size(); - perField.reset(); - perField.shrinkHash(numPostings); - fieldWriter.reset(); - } + TermsHashPerField perField = fieldWriter.termsHashPerField; + assert termsHash == null || termsHash == perField.termsHash; + termsHash = perField.termsHash; + int numPostings = perField.bytesHash.size(); + perField.reset(); + perField.shrinkHash(numPostings); + fieldWriter.reset(); + } - if (termsHash != null) { - termsHash.reset(); - } - success = true; - } finally { - if (success) { - IOUtils.close(consumer); - } else { - IOUtils.closeWhileHandlingException(consumer); - } + if (termsHash != null) { + termsHash.reset(); } } Index: lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java =================================================================== --- lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (revision 1516140) +++ lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (working copy) @@ -42,12 +42,17 @@ final FieldInfo fieldInfo; final DocumentsWriterPerThread.DocState docState; final FieldInvertState fieldState; - private boolean hasFreq; - private boolean hasProx; - private boolean hasOffsets; + boolean hasFreq; + boolean hasProx; + boolean hasOffsets; PayloadAttribute payloadAttribute; OffsetAttribute offsetAttribute; + long sumTotalTermFreq; + long sumDocFreq; + // How many docs have this field: + int docCount; + public FreqProxTermsWriterPerField(TermsHashPerField termsHashPerField, FreqProxTermsWriter parent, FieldInfo fieldInfo) { 
this.termsHashPerField = termsHashPerField; this.parent = parent; @@ -68,6 +73,16 @@ @Override void finish() { + sumDocFreq += fieldState.uniqueTermCount; + sumTotalTermFreq += fieldState.length; + if (fieldState.length > 0) { + docCount++; + } + + // nocommit we don't expose "overlaps" in any statistic + // today ... should we? i guess we cannot in general + // since we don't save this in the index + if (hasPayloads) { fieldInfo.setStorePayloads(); } @@ -83,6 +98,8 @@ return fieldInfo.name.compareTo(other.fieldInfo.name); } + // nocommit is this really called? we don't reuse? + // Called after flush void reset() { // Record, up front, whether our in-RAM format will be @@ -318,233 +335,10 @@ BytesRef payload; - /* Walk through all unique text tokens (Posting - * instances) found in this field and serialize them - * into a single RAM segment. */ - void flush(String fieldName, FieldsConsumer consumer, final SegmentWriteState state) - throws IOException { + int[] sortedTermIDs; - if (!fieldInfo.isIndexed()) { - return; // nothing to flush, don't bother the codec with the unindexed field - } - - final TermsConsumer termsConsumer = consumer.addField(fieldInfo); - final Comparator termComp = termsConsumer.getComparator(); - - // CONFUSING: this.indexOptions holds the index options - // that were current when we first saw this field. But - // it's possible this has changed, eg when other - // documents are indexed that cause a "downgrade" of the - // IndexOptions. So we must decode the in-RAM buffer - // according to this.indexOptions, but then write the - // new segment to the directory according to - // currentFieldIndexOptions: - final IndexOptions currentFieldIndexOptions = fieldInfo.getIndexOptions(); - assert currentFieldIndexOptions != null; - - final boolean writeTermFreq = currentFieldIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; - final boolean writePositions = currentFieldIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; - final boolean writeOffsets = currentFieldIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; - - final boolean readTermFreq = this.hasFreq; - final boolean readPositions = this.hasProx; - final boolean readOffsets = this.hasOffsets; - - //System.out.println("flush readTF=" + readTermFreq + " readPos=" + readPositions + " readOffs=" + readOffsets); - - // Make sure FieldInfo.update is working correctly!: - assert !writeTermFreq || readTermFreq; - assert !writePositions || readPositions; - assert !writeOffsets || readOffsets; - - assert !writeOffsets || writePositions; - - final Map segDeletes; - if (state.segDeletes != null && state.segDeletes.terms.size() > 0) { - segDeletes = state.segDeletes.terms; - } else { - segDeletes = null; - } - - final int[] termIDs = termsHashPerField.sortPostings(termComp); - final int numTerms = termsHashPerField.bytesHash.size(); - final BytesRef text = new BytesRef(); - final FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray; - final ByteSliceReader freq = new ByteSliceReader(); - final ByteSliceReader prox = new ByteSliceReader(); - - FixedBitSet visitedDocs = new FixedBitSet(state.segmentInfo.getDocCount()); - long sumTotalTermFreq = 0; - long sumDocFreq = 0; - - Term protoTerm = new Term(fieldName); - for (int i = 0; i < numTerms; i++) { - final int termID = termIDs[i]; - //System.out.println("term=" + termID); - // Get BytesRef - final int textStart = postings.textStarts[termID]; - termsHashPerField.bytePool.setBytesRef(text, 
textStart); - - termsHashPerField.initReader(freq, termID, 0); - if (readPositions || readOffsets) { - termsHashPerField.initReader(prox, termID, 1); - } - - // TODO: really TermsHashPerField should take over most - // of this loop, including merge sort of terms from - // multiple threads and interacting with the - // TermsConsumer, only calling out to us (passing us the - // DocsConsumer) to handle delivery of docs/positions - - final PostingsConsumer postingsConsumer = termsConsumer.startTerm(text); - - final int delDocLimit; - if (segDeletes != null) { - protoTerm.bytes = text; - final Integer docIDUpto = segDeletes.get(protoTerm); - if (docIDUpto != null) { - delDocLimit = docIDUpto; - } else { - delDocLimit = 0; - } - } else { - delDocLimit = 0; - } - - // Now termStates has numToMerge FieldMergeStates - // which all share the same term. Now we must - // interleave the docID streams. - int docFreq = 0; - long totalTermFreq = 0; - int docID = 0; - - while(true) { - //System.out.println(" cycle"); - final int termFreq; - if (freq.eof()) { - if (postings.lastDocCodes[termID] != -1) { - // Return last doc - docID = postings.lastDocIDs[termID]; - if (readTermFreq) { - termFreq = postings.termFreqs[termID]; - } else { - termFreq = -1; - } - postings.lastDocCodes[termID] = -1; - } else { - // EOF - break; - } - } else { - final int code = freq.readVInt(); - if (!readTermFreq) { - docID += code; - termFreq = -1; - } else { - docID += code >>> 1; - if ((code & 1) != 0) { - termFreq = 1; - } else { - termFreq = freq.readVInt(); - } - } - - assert docID != postings.lastDocIDs[termID]; - } - - docFreq++; - assert docID < state.segmentInfo.getDocCount(): "doc=" + docID + " maxDoc=" + state.segmentInfo.getDocCount(); - - // NOTE: we could check here if the docID was - // deleted, and skip it. However, this is somewhat - // dangerous because it can yield non-deterministic - // behavior since we may see the docID before we see - // the term that caused it to be deleted. This - // would mean some (but not all) of its postings may - // make it into the index, which'd alter the docFreq - // for those terms. We could fix this by doing two - // passes, ie first sweep marks all del docs, and - // 2nd sweep does the real flush, but I suspect - // that'd add too much time to flush. - visitedDocs.set(docID); - postingsConsumer.startDoc(docID, writeTermFreq ? termFreq : -1); - if (docID < delDocLimit) { - // Mark it deleted. TODO: we could also skip - // writing its postings; this would be - // deterministic (just for this Term's docs). - - // TODO: can we do this reach-around in a cleaner way???? - if (state.liveDocs == null) { - state.liveDocs = docState.docWriter.codec.liveDocsFormat().newLiveDocs(state.segmentInfo.getDocCount()); - } - if (state.liveDocs.get(docID)) { - state.delCountOnFlush++; - state.liveDocs.clear(docID); - } - } - - totalTermFreq += termFreq; - - // Carefully copy over the prox + payload info, - // changing the format to match Lucene's segment - // format. 
- - if (readPositions || readOffsets) { - // we did record positions (& maybe payload) and/or offsets - int position = 0; - int offset = 0; - for(int j=0;j>> 1; - - if ((code & 1) != 0) { - - // This position has a payload - final int payloadLength = prox.readVInt(); - - if (payload == null) { - payload = new BytesRef(); - payload.bytes = new byte[payloadLength]; - } else if (payload.bytes.length < payloadLength) { - payload.grow(payloadLength); - } - - prox.readBytes(payload.bytes, 0, payloadLength); - payload.length = payloadLength; - thisPayload = payload; - - } else { - thisPayload = null; - } - - if (readOffsets) { - final int startOffset = offset + prox.readVInt(); - final int endOffset = startOffset + prox.readVInt(); - if (writePositions) { - if (writeOffsets) { - assert startOffset >=0 && endOffset >= startOffset : "startOffset=" + startOffset + ",endOffset=" + endOffset + ",offset=" + offset; - postingsConsumer.addPosition(position, thisPayload, startOffset, endOffset); - } else { - postingsConsumer.addPosition(position, thisPayload, -1, -1); - } - } - offset = startOffset; - } else if (writePositions) { - postingsConsumer.addPosition(position, thisPayload, -1, -1); - } - } - } - } - postingsConsumer.finishDoc(); - } - termsConsumer.finishTerm(text, new TermStats(docFreq, writeTermFreq ? totalTermFreq : -1)); - sumTotalTermFreq += totalTermFreq; - sumDocFreq += docFreq; - } - - termsConsumer.finish(writeTermFreq ? sumTotalTermFreq : -1, sumDocFreq, visitedDocs.cardinality()); + void sortPostings(Comparator termComp) { + assert sortedTermIDs == null; + sortedTermIDs = termsHashPerField.sortPostings(termComp); } }
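For context: with this change, postings flush and merge both go through a read-only Fields view (FreqProxFields at flush time, MappedMultiFields at merge time) that PostingsFormat.write consumes in a single pass. Below is a minimal sketch of walking such a Fields view the same way the new default write() does, but it only counts documents; FieldsWalker and walk() are illustrative names, not classes in this patch, and freqs/positions/payloads are left out to keep it small.

import java.io.IOException;

import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

/** Illustrative only: a single pass over a Fields view, mirroring the loop
 *  structure of the new PostingsFormat.write default implementation, but just
 *  counting postings instead of feeding a codec's consumer chain. */
class FieldsWalker {
  static void walk(Fields fields) throws IOException {
    for (String field : fields) {                   // fields come back in name order
      Terms terms = fields.terms(field);
      if (terms == null) {
        continue;
      }
      TermsEnum termsEnum = terms.iterator(null);
      DocsEnum docsEnum = null;
      BytesRef term;
      while ((term = termsEnum.next()) != null) {   // for all terms in this field
        // no freqs requested, so this works even for DOCS_ONLY fields:
        docsEnum = termsEnum.docs(null, docsEnum, DocsEnum.FLAG_NONE);
        int docFreq = 0;
        while (docsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS) {  // for all docs with this term
          docFreq++;
        }
        System.out.println(field + ":" + term.utf8ToString() + " df=" + docFreq);
      }
    }
  }
}

Because the view is consumed in one pass, per-term and per-field stats (docFreq, totalTermFreq, sumDocFreq, ...) are only known after the postings have been iterated; that is why FreqProxTermsEnum.docFreq()/totalTermFreq() throw UnsupportedOperationException and the totals are handed to finishTerm()/finish() only at the end, as in the write() implementation above.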