Index: lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java	(revision 1516845)
+++ lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java	(working copy)
@@ -22,6 +22,7 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.Comparator;
 import java.util.EnumSet;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -30,6 +31,7 @@
 import java.util.NoSuchElementException;
 import java.util.Random;
 import java.util.Set;
+import java.util.SortedMap;
 import java.util.TreeMap;
 
 import org.apache.lucene.codecs.Codec;
@@ -126,6 +128,7 @@
     private final BytesRef payload;
     private final IndexOptions options;
     private final boolean doPositions;
+    private final boolean allowPayloads;
 
     private int docID;
     private int freq;
@@ -138,11 +141,12 @@
     private int posSpacing;
     private int posUpto;
 
-    public SeedPostings(long seed, int minDocFreq, int maxDocFreq, Bits liveDocs, IndexOptions options) {
+    public SeedPostings(long seed, int minDocFreq, int maxDocFreq, Bits liveDocs, IndexOptions options, boolean allowPayloads) {
       random = new Random(seed);
       docRandom = new Random(random.nextLong());
       docFreq = _TestUtil.nextInt(random, minDocFreq, maxDocFreq);
       this.liveDocs = liveDocs;
+      this.allowPayloads = allowPayloads;
 
       // TODO: more realistic to inversely tie this to numDocs:
       maxDocSpacing = _TestUtil.nextInt(random, 1, 100);
@@ -249,6 +253,9 @@
       } else {
         payload.length = 0;
       }
+      if (!allowPayloads) {
+        payload.length = 0;
+      }
 
       startOffset = offset + random.nextInt(5);
       endOffset = startOffset + random.nextInt(10);
@@ -295,7 +302,7 @@
   }
 
   // Holds all postings:
-  private static Map<String,Map<BytesRef,Long>> fields;
+  private static Map<String,SortedMap<BytesRef,Long>> fields;
 
   private static FieldInfos fieldInfos;
@@ -307,7 +314,7 @@
   private static long totalPostings;
   private static long totalPayloadBytes;
 
-  private static SeedPostings getSeedPostings(String term, long seed, boolean withLiveDocs, IndexOptions options) {
+  private static SeedPostings getSeedPostings(String term, long seed, boolean withLiveDocs, IndexOptions options, boolean allowPayloads) {
     int minDocFreq, maxDocFreq;
     if (term.startsWith("big_")) {
       minDocFreq = RANDOM_MULTIPLIER * 50000;
@@ -323,14 +330,14 @@
       maxDocFreq = 3;
     }
 
-    return new SeedPostings(seed, minDocFreq, maxDocFreq, withLiveDocs ? globalLiveDocs : null, options);
+    return new SeedPostings(seed, minDocFreq, maxDocFreq, withLiveDocs ?
globalLiveDocs : null, options, allowPayloads); } @BeforeClass public static void createPostings() throws IOException { totalPostings = 0; totalPayloadBytes = 0; - fields = new TreeMap>(); + fields = new TreeMap>(); final int numFields = _TestUtil.nextInt(random(), 1, 5); if (VERBOSE) { @@ -351,7 +358,7 @@ null, DocValuesType.NUMERIC, null); fieldUpto++; - Map postings = new TreeMap(); + SortedMap postings = new TreeMap(); fields.put(field, postings); Set seenTerms = new HashSet(); @@ -388,7 +395,7 @@ // NOTE: sort of silly: we enum all the docs just to // get the maxDoc - DocsEnum docsEnum = getSeedPostings(term, termSeed, false, IndexOptions.DOCS_ONLY); + DocsEnum docsEnum = getSeedPostings(term, termSeed, false, IndexOptions.DOCS_ONLY, true); int doc; int lastDoc = 0; while((doc = docsEnum.nextDoc()) != DocsEnum.NO_MORE_DOCS) { @@ -412,7 +419,7 @@ } allTerms = new ArrayList(); - for(Map.Entry> fieldEnt : fields.entrySet()) { + for(Map.Entry> fieldEnt : fields.entrySet()) { String field = fieldEnt.getKey(); for(Map.Entry termEnt : fieldEnt.getValue().entrySet()) { allTerms.add(new FieldAndTerm(field, termEnt.getKey())); @@ -432,6 +439,203 @@ globalLiveDocs = null; } + private static class SeedFields extends Fields { + final Map> fields; + final FieldInfos fieldInfos; + final IndexOptions maxAllowed; + final boolean allowPayloads; + + public SeedFields(Map> fields, FieldInfos fieldInfos, IndexOptions maxAllowed, boolean allowPayloads) { + this.fields = fields; + this.fieldInfos = fieldInfos; + this.maxAllowed = maxAllowed; + this.allowPayloads = allowPayloads; + } + + @Override + public Iterator iterator() { + return fields.keySet().iterator(); + } + + @Override + public Terms terms(String field) { + SortedMap terms = fields.get(field); + if (terms == null) { + return null; + } else { + return new SeedTerms(terms, fieldInfos.fieldInfo(field), maxAllowed, allowPayloads); + } + } + + @Override + public int size() { + return fields.size(); + } + } + + private static class SeedTerms extends Terms { + final SortedMap terms; + final FieldInfo fieldInfo; + final IndexOptions maxAllowed; + final boolean allowPayloads; + + public SeedTerms(SortedMap terms, FieldInfo fieldInfo, IndexOptions maxAllowed, boolean allowPayloads) { + this.terms = terms; + this.fieldInfo = fieldInfo; + this.maxAllowed = maxAllowed; + this.allowPayloads = allowPayloads; + } + + @Override + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + @Override + public TermsEnum iterator(TermsEnum reuse) { + SeedTermsEnum termsEnum; + if (reuse != null && reuse instanceof SeedTermsEnum) { + termsEnum = (SeedTermsEnum) reuse; + if (termsEnum.terms != terms) { + termsEnum = new SeedTermsEnum(terms, maxAllowed, allowPayloads); + } + } else { + termsEnum = new SeedTermsEnum(terms, maxAllowed, allowPayloads); + } + termsEnum.reset(); + + return termsEnum; + } + + @Override + public long size() { + return terms.size(); + } + + @Override + public long getSumTotalTermFreq() { + throw new UnsupportedOperationException(); + } + + @Override + public long getSumDocFreq() { + throw new UnsupportedOperationException(); + } + + @Override + public int getDocCount() { + throw new UnsupportedOperationException(); + } + + @Override + public boolean hasOffsets() { + return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + } + + @Override + public boolean hasPositions() { + return 
fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + } + + @Override + public boolean hasPayloads() { + return fieldInfo.hasPayloads(); + } + } + + private static class SeedTermsEnum extends TermsEnum { + final SortedMap terms; + final IndexOptions maxAllowed; + final boolean allowPayloads; + + private Iterator> iterator; + + private Map.Entry current; + + public SeedTermsEnum(SortedMap terms, IndexOptions maxAllowed, boolean allowPayloads) { + this.terms = terms; + this.maxAllowed = maxAllowed; + this.allowPayloads = allowPayloads; + } + + void reset() { + iterator = terms.entrySet().iterator(); + } + + @Override + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + @Override + public SeekStatus seekCeil(BytesRef text) { + SortedMap tailMap = terms.tailMap(text); + if (tailMap.isEmpty()) { + return SeekStatus.END; + } else { + iterator = tailMap.entrySet().iterator(); + if (tailMap.firstKey().equals(text)) { + return SeekStatus.FOUND; + } else { + return SeekStatus.NOT_FOUND; + } + } + } + + @Override + public BytesRef next() { + if (iterator.hasNext()) { + current = iterator.next(); + return term(); + } else { + return null; + } + } + + @Override + public void seekExact(long ord) { + throw new UnsupportedOperationException(); + } + + @Override + public BytesRef term() { + return current.getKey(); + } + + @Override + public long ord() { + throw new UnsupportedOperationException(); + } + + @Override + public int docFreq() { + throw new UnsupportedOperationException(); + } + + @Override + public long totalTermFreq() { + throw new UnsupportedOperationException(); + } + + @Override + public final DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException { + if (liveDocs != null) { + throw new IllegalArgumentException("liveDocs must be null"); + } + // nocommit validate flags? + return getSeedPostings(current.getKey().utf8ToString(), current.getValue(), false, maxAllowed, allowPayloads); + } + + @Override + public final DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException { + if (liveDocs != null) { + throw new IllegalArgumentException("liveDocs must be null"); + } + // nocommit validate flags? + return getSeedPostings(current.getKey().utf8ToString(), current.getValue(), false, maxAllowed, allowPayloads); + } + } + // TODO maybe instead of @BeforeClass just make a single test run: build postings & index & test it? 
  private FieldInfos currentFieldInfos;
 
@@ -489,6 +693,12 @@
 
     SegmentWriteState writeState = new SegmentWriteState(null, dir, segmentInfo, newFieldInfos, null, new IOContext(new FlushInfo(maxDoc, bytes)));
+
+    Fields seedFields = new SeedFields(fields, newFieldInfos, maxAllowed, allowPayloads);
+
+    codec.postingsFormat().write(seedFields, writeState);
+
+    /*
     FieldsConsumer fieldsConsumer = codec.postingsFormat().fieldsConsumer(writeState);
 
     for(Map.Entry<String,SortedMap<BytesRef,Long>> fieldEnt : fields.entrySet()) {
@@ -562,7 +772,10 @@
     }
 
     fieldsConsumer.close();
+    */
+
+
     if (VERBOSE) {
       System.out.println("TEST: after indexing: files=");
       for(String file : dir.listAll()) {
@@ -625,7 +838,8 @@
 
       SeedPostings expected = getSeedPostings(term.utf8ToString(),
                                               fields.get(field).get(term),
                                               useLiveDocs,
-                                              maxIndexOptions);
+                                              maxIndexOptions,
+                                              true);
       assertEquals(expected.docFreq, termsEnum.docFreq());
 
       boolean allowFreqs = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0 &&
Index: lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPostingsFormat.java
===================================================================
--- lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPostingsFormat.java	(revision 1516845)
+++ lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPostingsFormat.java	(working copy)
@@ -22,9 +22,11 @@
 import org.apache.lucene.codecs.FieldsConsumer;
 import org.apache.lucene.codecs.FieldsProducer;
 import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
-import org.apache.lucene.index.SegmentReadState;
-import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.util.IOUtils;
 
 /** For debugging, curiosity, transparency only!! Do not
  * use this codec in production.
@@ -41,8 +43,25 @@
   }
 
   @Override
+  public void write(Fields fields, SegmentWriteState state) throws IOException {
+    SimpleTextFieldsWriter writer = new SimpleTextFieldsWriter(state);
+    boolean success = false;
+    try {
+      writer.write(state.fieldInfos, fields);
+      success = true;
+    } finally {
+      if (success) {
+        IOUtils.close(writer);
+      } else {
+        IOUtils.closeWhileHandlingException(writer);
+      }
+    }
+  }
+
+  /** Writes a new segment */
   public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-    return new SimpleTextFieldsWriter(state);
+    // nocommit can we "detect" this? ie you must override one of write or fieldsConsumer
+    throw new UnsupportedOperationException("SimpleText doesn't support FieldsConsumer");
   }
 
   @Override
Index: lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsWriter.java
===================================================================
--- lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsWriter.java	(revision 1516845)
+++ lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsWriter.java	(working copy)
@@ -17,19 +17,25 @@
  * limitations under the License.
*/ -import org.apache.lucene.util.BytesRef; +import java.io.IOException; +import java.util.Comparator; + import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.PostingsConsumer; import org.apache.lucene.codecs.TermStats; import org.apache.lucene.codecs.TermsConsumer; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.Fields; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; -import java.io.IOException; -import java.util.Comparator; - class SimpleTextFieldsWriter extends FieldsConsumer { private final IndexOutput out; @@ -50,6 +56,134 @@ out = state.directory.createOutput(fileName, state.context); } + public void write(FieldInfos fieldInfos, Fields fields) throws IOException { + + // for each field + for(String field : fields) { + Terms terms = fields.terms(field); + FieldInfo fieldInfo = fieldInfos.fieldInfo(field); + + // nocommit paranoia? + if (terms != null) { + boolean wroteField = false; + + boolean hasPositions = terms.hasPositions(); + + // nocommit shouldn't we add hasFreqs to Terms? + // then we don't need FieldInfos here + boolean hasFreqs = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_ONLY) > 0; + boolean hasOffsets = terms.hasOffsets(); + + int flags = 0; + if (hasPositions) { + flags = DocsAndPositionsEnum.FLAG_PAYLOADS; + if (hasOffsets) { + flags |= DocsAndPositionsEnum.FLAG_OFFSETS; + } + } else { + if (hasFreqs) { + flags = DocsEnum.FLAG_FREQS; + } + } + + TermsEnum termsEnum = terms.iterator(null); + DocsAndPositionsEnum posEnum = null; + DocsEnum docsEnum = null; + + // for each term in field + while(true) { + BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + + if (hasPositions) { + posEnum = termsEnum.docsAndPositions(null, posEnum, flags); + docsEnum = posEnum; + } else { + docsEnum = termsEnum.docs(null, docsEnum, flags); + } + + boolean wroteTerm = false; + + // for each doc in field+term + while(true) { + int doc = docsEnum.nextDoc(); + if (doc == DocsEnum.NO_MORE_DOCS) { + break; + } + + if (!wroteTerm) { + + if (!wroteField) { + // we lazily do this, in case the field had + // no terms + write(FIELD); + write(field); + newline(); + wroteField = true; + } + + // we lazily do this, in case the term had + // zero docs + write(TERM); + write(term); + newline(); + wroteTerm = true; + } + + write(DOC); + write(Integer.toString(doc)); + newline(); + if (hasFreqs) { + int freq = docsEnum.freq(); + write(FREQ); + write(Integer.toString(freq)); + newline(); + + if (hasPositions) { + // for assert: + int lastStartOffset = 0; + + // for each pos in field+term+doc + for(int i=0;i= startOffset; + assert startOffset >= lastStartOffset: "startOffset=" + startOffset + " lastStartOffset=" + lastStartOffset; + lastStartOffset = startOffset; + write(START_OFFSET); + write(Integer.toString(startOffset)); + newline(); + write(END_OFFSET); + write(Integer.toString(endOffset)); + newline(); + } + + BytesRef payload = posEnum.getPayload(); + + if (payload != null && payload.length > 0) { + assert payload.length != 0; + write(PAYLOAD); + write(payload); + newline(); + } + } + } + } + } + } + } + } + 
} + private void write(String s) throws IOException { SimpleTextUtil.write(out, s, scratch); } Index: lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java =================================================================== --- lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java (revision 1516845) +++ lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java (working copy) @@ -29,6 +29,7 @@ import java.util.concurrent.atomic.AtomicBoolean; import org.apache.lucene.analysis.*; +import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -537,21 +538,15 @@ public void eval(MockDirectoryWrapper dir) throws IOException { if (doFail) { StackTraceElement[] trace = new Exception().getStackTrace(); - boolean sawAppend = false; boolean sawFlush = false; for (int i = 0; i < trace.length; i++) { - if (sawAppend && sawFlush) { - break; - } - if (FreqProxTermsWriterPerField.class.getName().equals(trace[i].getClassName()) && "flush".equals(trace[i].getMethodName())) { - sawAppend = true; - } if ("flush".equals(trace[i].getMethodName())) { sawFlush = true; + break; } } - if (sawAppend && sawFlush && count++ >= 30) { + if (sawFlush && count++ >= 30) { doFail = false; throw new IOException("now failing during flush"); } Index: lucene/core/src/test/org/apache/lucene/index/TestCodecs.java =================================================================== --- lucene/core/src/test/org/apache/lucene/index/TestCodecs.java (revision 1516845) +++ lucene/core/src/test/org/apache/lucene/index/TestCodecs.java (working copy) @@ -19,8 +19,10 @@ import java.io.IOException; import java.util.Arrays; +import java.util.Comparator; import java.util.HashSet; import java.util.Iterator; +import java.util.Iterator; import java.util.Random; import org.apache.lucene.analysis.MockAnalyzer; @@ -44,6 +46,7 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Constants; import org.apache.lucene.util.InfoStream; @@ -653,18 +656,267 @@ } } + private static class DataFields extends Fields { + private final FieldData[] fields; + + public DataFields(FieldData[] fields) { + // already sorted: + this.fields = fields; + } + + @Override + public Iterator iterator() { + return new Iterator() { + int upto = -1; + + @Override + public boolean hasNext() { + return upto+1 < fields.length; + } + + @Override + public String next() { + upto++; + return fields[upto].fieldInfo.name; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + @Override + public Terms terms(String field) { + // Slow linear search: + for(FieldData fieldData : fields) { + if (fieldData.fieldInfo.name.equals(field)) { + return new DataTerms(fieldData); + } + } + return null; + } + + @Override + public int size() { + return fields.length; + } + } + + private static class DataTerms extends Terms { + final FieldData fieldData; + + public DataTerms(FieldData fieldData) { + this.fieldData = fieldData; + } + + @Override + public TermsEnum iterator(TermsEnum reuse) { + return new DataTermsEnum(fieldData); + } + + @Override + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + @Override + public long size() { + throw new 
UnsupportedOperationException(); + } + + @Override + public long getSumTotalTermFreq() { + throw new UnsupportedOperationException(); + } + + @Override + public long getSumDocFreq() { + throw new UnsupportedOperationException(); + } + + @Override + public int getDocCount() { + throw new UnsupportedOperationException(); + } + + @Override + public boolean hasOffsets() { + return fieldData.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + } + + @Override + public boolean hasPositions() { + return fieldData.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + } + + @Override + public boolean hasPayloads() { + return fieldData.fieldInfo.hasPayloads(); + } + } + + private static class DataTermsEnum extends TermsEnum { + final FieldData fieldData; + private int upto = -1; + + public DataTermsEnum(FieldData fieldData) { + this.fieldData = fieldData; + } + + @Override + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + @Override + public BytesRef next() { + upto++; + if (upto == fieldData.terms.length) { + return null; + } + + return term(); + } + + @Override + public BytesRef term() { + return fieldData.terms[upto].text; + } + + @Override + public SeekStatus seekCeil(BytesRef text) { + // Stupid linear impl: + for(int i=0;i 0) { + upto = i; + return SeekStatus.NOT_FOUND; + } + } + + return SeekStatus.END; + } + + @Override + public void seekExact(long ord) { + throw new UnsupportedOperationException(); + } + + @Override + public long ord() { + throw new UnsupportedOperationException(); + } + + @Override + public int docFreq() { + throw new UnsupportedOperationException(); + } + + @Override + public long totalTermFreq() { + throw new UnsupportedOperationException(); + } + + @Override + public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) { + assert liveDocs == null; + return new DataDocsAndPositionsEnum(fieldData.terms[upto]); + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) { + assert liveDocs == null; + return new DataDocsAndPositionsEnum(fieldData.terms[upto]); + } + } + + private static class DataDocsAndPositionsEnum extends DocsAndPositionsEnum { + final TermData termData; + int docUpto = -1; + int posUpto; + + public DataDocsAndPositionsEnum(TermData termData) { + this.termData = termData; + } + + @Override + public long cost() { + throw new UnsupportedOperationException(); + } + + @Override + public int nextDoc() { + docUpto++; + if (docUpto == termData.docs.length) { + return NO_MORE_DOCS; + } + posUpto = -1; + return docID(); + } + + @Override + public int docID() { + return termData.docs[docUpto]; + } + + @Override + public int advance(int target) { + // Slow linear impl: + nextDoc(); + while (docID() < target) { + nextDoc(); + } + + return docID(); + } + + @Override + public int freq() { + return termData.positions[docUpto].length; + } + + @Override + public int nextPosition() { + posUpto++; + return termData.positions[docUpto][posUpto].pos; + } + + @Override + public BytesRef getPayload() { + return termData.positions[docUpto][posUpto].payload; + } + + @Override + public int startOffset() { + throw new UnsupportedOperationException(); + } + + @Override + public int endOffset() { + throw new UnsupportedOperationException(); + } + } + private void write(final FieldInfos fieldInfos, final Directory dir, final FieldData[] fields) throws Throwable { final Codec codec = 
Codec.getDefault(); final SegmentInfo si = new SegmentInfo(dir, Constants.LUCENE_MAIN_VERSION, SEGMENT, 10000, false, codec, null, null); final SegmentWriteState state = new SegmentWriteState(InfoStream.getDefault(), dir, si, fieldInfos, null, newIOContext(random())); + Arrays.sort(fields); + /* final FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(state); - Arrays.sort(fields); for (final FieldData field : fields) { field.write(consumer); } consumer.close(); + */ + codec.postingsFormat().write(new DataFields(fields), state); } public void testDocsOnlyFreq() throws Exception { Index: lucene/core/src/test/org/apache/lucene/index/TestLongPostings.java =================================================================== --- lucene/core/src/test/org/apache/lucene/index/TestLongPostings.java (revision 1516845) +++ lucene/core/src/test/org/apache/lucene/index/TestLongPostings.java (working copy) @@ -199,6 +199,9 @@ } if (random().nextInt(6) == 3) { + if (VERBOSE) { + System.out.println(" check positions"); + } final int freq = postings.freq(); assertTrue(freq >=1 && freq <= 4); for(int pos=0;pos @@ -64,93 +69,100 @@ * segment suffix name for each field. */ public static final String PER_FIELD_SUFFIX_KEY = PerFieldPostingsFormat.class.getSimpleName() + ".suffix"; - /** Sole constructor. */ public PerFieldPostingsFormat() { super(PER_FIELD_NAME); } + /* @Override public final FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { return new FieldsWriter(state); } - - static class FieldsConsumerAndSuffix implements Closeable { - FieldsConsumer consumer; + */ + + /** Group of fields written by one PostingsFormat */ + static class FieldsGroup { + final Set fields = new TreeSet(); int suffix; - - @Override - public void close() throws IOException { - consumer.close(); - } - } - - private class FieldsWriter extends FieldsConsumer { - private final Map formats = new HashMap(); - private final Map suffixes = new HashMap(); - - private final SegmentWriteState segmentWriteState; + /** Custom SegmentWriteState for this group of fields, + * with the segmentSuffix uniqueified for this + * PostingsFormat */ + SegmentWriteState state; + }; - public FieldsWriter(SegmentWriteState state) { - segmentWriteState = state; - } + @Override + public void write(Fields fields, SegmentWriteState state) throws IOException { - @Override - public TermsConsumer addField(FieldInfo field) throws IOException { - final PostingsFormat format = getPostingsFormatForField(field.name); + // Maps a PostingsFormat instance to the suffix it + // should use + Map formatToGroups = new HashMap(); + + // Holds last suffix of each PostingFormat name + Map suffixes = new HashMap(); + + // First pass: assign field -> PostingsFormat + for(String field : fields) { + FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field); + + final PostingsFormat format = getPostingsFormatForField(field); + if (format == null) { - throw new IllegalStateException("invalid null PostingsFormat for field=\"" + field.name + "\""); + throw new IllegalStateException("invalid null PostingsFormat for field=\"" + field + "\""); } - final String formatName = format.getName(); + String formatName = format.getName(); - String previousValue = field.putAttribute(PER_FIELD_FORMAT_KEY, formatName); - assert previousValue == null; - - Integer suffix; - - FieldsConsumerAndSuffix consumer = formats.get(format); - if (consumer == null) { - // First time we are seeing this format; create a new instance - + FieldsGroup group = 
formatToGroups.get(format); + if (group == null) { + // First time we are seeing this format; create a + // new instance + // bump the suffix - suffix = suffixes.get(formatName); + Integer suffix = suffixes.get(formatName); if (suffix == null) { suffix = 0; } else { suffix = suffix + 1; } suffixes.put(formatName, suffix); - - final String segmentSuffix = getFullSegmentSuffix(field.name, - segmentWriteState.segmentSuffix, - getSuffix(formatName, Integer.toString(suffix))); - consumer = new FieldsConsumerAndSuffix(); - consumer.consumer = format.fieldsConsumer(new SegmentWriteState(segmentWriteState, segmentSuffix)); - consumer.suffix = suffix; - formats.put(format, consumer); + + String segmentSuffix = getFullSegmentSuffix(field, + state.segmentSuffix, + getSuffix(formatName, Integer.toString(suffix))); + group = new FieldsGroup(); + group.state = new SegmentWriteState(state, segmentSuffix); + group.suffix = suffix; + formatToGroups.put(format, group); } else { // we've already seen this format, so just grab its suffix assert suffixes.containsKey(formatName); - suffix = consumer.suffix; } - - previousValue = field.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(suffix)); + + group.fields.add(field); + + String previousValue = fieldInfo.putAttribute(PER_FIELD_FORMAT_KEY, formatName); assert previousValue == null; - // TODO: we should only provide the "slice" of FIS - // that this PF actually sees ... then stuff like - // .hasProx could work correctly? - // NOTE: .hasProx is already broken in the same way for the non-perfield case, - // if there is a fieldinfo with prox that has no postings, you get a 0 byte file. - return consumer.consumer.addField(field); + previousValue = fieldInfo.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(group.suffix)); + assert previousValue == null; } - @Override - public void close() throws IOException { - // Close all subs - IOUtils.close(formats.values()); + // Second pass: write postings + for(Map.Entry ent : formatToGroups.entrySet()) { + PostingsFormat format = ent.getKey(); + final FieldsGroup group = ent.getValue(); + + // Exposes only the fields from this group: + Fields maskedFields = new FilterFields(fields) { + @Override + public Iterator iterator() { + return group.fields.iterator(); + } + }; + + format.write(maskedFields, group.state); } } Index: lucene/core/src/java/org/apache/lucene/codecs/MappedMultiFields.java =================================================================== --- lucene/core/src/java/org/apache/lucene/codecs/MappedMultiFields.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/codecs/MappedMultiFields.java (working copy) @@ -0,0 +1,156 @@ +package org.apache.lucene.codecs; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// nocommit move this and the other Mapped* into oal.index? +// then keep them all package private... + +import java.io.IOException; + +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.MultiDocsAndPositionsEnum; +import org.apache.lucene.index.MultiDocsEnum; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.MultiTerms; +import org.apache.lucene.index.MultiTermsEnum; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.Bits; + +import static org.apache.lucene.index.FilterAtomicReader.FilterFields; +import static org.apache.lucene.index.FilterAtomicReader.FilterTerms; +import static org.apache.lucene.index.FilterAtomicReader.FilterTermsEnum; + +/** A {@link Fields} implementation that merges multiple + * Fields into one, and maps around deleted documents. + * This is used for merging. */ + +// nocommit use FilterAtomicReader.Filter*? +// nocommit how to get checkAbort in here... + +public class MappedMultiFields extends FilterFields { + final MergeState mergeState; + + public MappedMultiFields(MergeState mergeState, MultiFields multiFields) { + super(multiFields); + this.mergeState = mergeState; + } + + @Override + public Terms terms(String field) throws IOException { + MultiTerms terms = (MultiTerms) in.terms(field); + if (terms == null) { + return null; + } else { + return new MappedMultiTerms(mergeState, terms); + } + } + + private static class MappedMultiTerms extends FilterTerms { + final MergeState mergeState; + + public MappedMultiTerms(MergeState mergeState, MultiTerms multiTerms) { + super(multiTerms); + this.mergeState = mergeState; + } + + @Override + public TermsEnum iterator(TermsEnum reuse) throws IOException { + return new MappedMultiTermsEnum(mergeState, (MultiTermsEnum) in.iterator(reuse)); + } + + @Override + public long size() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long getSumTotalTermFreq() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long getSumDocFreq() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int getDocCount() throws IOException { + throw new UnsupportedOperationException(); + } + } + + private static class MappedMultiTermsEnum extends FilterTermsEnum { + final MergeState mergeState; + + public MappedMultiTermsEnum(MergeState mergeState, MultiTermsEnum multiTermsEnum) { + super(multiTermsEnum); + this.mergeState = mergeState; + } + + @Override + public int docFreq() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long totalTermFreq() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException { + if (liveDocs != null) { + throw new IllegalArgumentException("liveDocs must be null"); + } + MappingMultiDocsEnum mappingDocsEnum; + if (reuse instanceof MappingMultiDocsEnum) { + mappingDocsEnum = (MappingMultiDocsEnum) reuse; + } else { + mappingDocsEnum = new MappingMultiDocsEnum(); + // nocommit why not pass to ctor? 
+ mappingDocsEnum.setMergeState(mergeState); + } + + MultiDocsEnum docsEnum = (MultiDocsEnum) in.docs(liveDocs, mappingDocsEnum.multiDocsEnum, flags); + mappingDocsEnum.reset(docsEnum); + return mappingDocsEnum; + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException { + if (liveDocs != null) { + throw new IllegalArgumentException("liveDocs must be null"); + } + MappingMultiDocsAndPositionsEnum mappingDocsAndPositionsEnum; + if (reuse instanceof MappingMultiDocsAndPositionsEnum) { + mappingDocsAndPositionsEnum = (MappingMultiDocsAndPositionsEnum) reuse; + } else { + mappingDocsAndPositionsEnum = new MappingMultiDocsAndPositionsEnum(); + // nocommit why not pass to ctor? + mappingDocsAndPositionsEnum.setMergeState(mergeState); + } + + MultiDocsAndPositionsEnum docsAndPositionsEnum = (MultiDocsAndPositionsEnum) in.docsAndPositions(liveDocs, mappingDocsAndPositionsEnum.multiDocsAndPositionsEnum, flags); + mappingDocsAndPositionsEnum.reset(docsAndPositionsEnum); + return mappingDocsAndPositionsEnum; + } + } +} Property changes on: lucene/core/src/java/org/apache/lucene/codecs/MappedMultiFields.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java =================================================================== --- lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java (revision 1516845) +++ lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java (working copy) @@ -22,8 +22,18 @@ import java.util.Set; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; // javadocs +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.NamedSPILoader; /** @@ -71,9 +81,137 @@ public final String getName() { return name; } + + // nocommit are stats ok on exceptions + + // nocommit is it bad that an PF can no longer "easily" customize merge....? + + // nocommit javadoc the scary stuff + // - you need to "lazy write" your field? in case there were no terms + // - you need to "lazy write" your term? in case there were no docs + // - you need to compute stats (or, can we make some sugar for this ...) + + // nocommit ... what to do if the caller passes non-null liveDocs? + + // nocommit this code is temporary: fix all codecs to impl write then make this abstract & remove fieldsConsumer method: + // nocommit but ... how can we avoid requiring all codecs to compute the stats? + // nocommit maybe: you ARE allowed to call e.g. DE.docFreq, but only after you fully iterated through it? same for the other stats... problem is what if youiterate twice, e.g. pulsing or bitset wrapped PBF? 
+ + public void write(Fields fields, SegmentWriteState state) throws IOException { + FieldsConsumer fieldsConsumer = fieldsConsumer(state); + + boolean success = false; + try { + for(String field : fields) { // for all fields + FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field); + IndexOptions indexOptions = fieldInfo.getIndexOptions(); + TermsConsumer termsConsumer = fieldsConsumer.addField(fieldInfo); + Terms terms = fields.terms(field); + if (terms != null) { + + // Holds all docs that have this field: + FixedBitSet visitedDocs = new FixedBitSet(state.segmentInfo.getDocCount()); + + boolean hasFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + boolean hasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + assert hasPositions == terms.hasPositions(); + boolean hasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + assert hasOffsets == terms.hasOffsets(); + + long sumTotalTermFreq = 0; + long sumDocFreq = 0; + + int flags = 0; + if (hasPositions == false) { + if (hasFreq) { + flags |= DocsEnum.FLAG_FREQS; + } + } else { + flags = DocsAndPositionsEnum.FLAG_PAYLOADS; + if (hasOffsets) { + flags |= DocsAndPositionsEnum.FLAG_OFFSETS; + } + } + + DocsEnum docsEnum = null; + DocsAndPositionsEnum docsAndPositionsEnum = null; + TermsEnum termsEnum = terms.iterator(null); + + while (true) { // for all terms in this field + BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + if (hasPositions) { + docsAndPositionsEnum = termsEnum.docsAndPositions(null, docsAndPositionsEnum, flags); + docsEnum = docsAndPositionsEnum; + } else { + docsEnum = termsEnum.docs(null, docsEnum, flags); + docsAndPositionsEnum = null; + } + + PostingsConsumer postingsConsumer = termsConsumer.startTerm(term); + + // How many documents have this term: + int docFreq = 0; + + // How many times this term occurs: + long totalTermFreq = 0; + + while(true) { // for all docs in this field+term + int doc = docsEnum.nextDoc(); + if (doc == DocsEnum.NO_MORE_DOCS) { + break; + } + docFreq++; + visitedDocs.set(doc); + if (hasFreq) { + int freq = docsEnum.freq(); + postingsConsumer.startDoc(doc, freq); + totalTermFreq += freq; + + if (hasPositions) { + for(int i=0;i 0) { + termsConsumer.finishTerm(term, new TermStats(docFreq, hasFreq ? totalTermFreq : -1)); + sumTotalTermFreq += totalTermFreq; + sumDocFreq += docFreq; + } + } + + termsConsumer.finish(hasFreq ? sumTotalTermFreq : -1, sumDocFreq, visitedDocs.cardinality()); + } + } + success = true; + } finally { + if (success) { + IOUtils.close(fieldsConsumer); + } else { + IOUtils.closeWhileHandlingException(fieldsConsumer); + } + } + } /** Writes a new segment */ - public abstract FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException; + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + // nocommit can we "detect" this? ie you must override one of write or fieldsConsumer + throw new UnsupportedOperationException(); + } /** Reads a segment. 
NOTE: by the time this call * returns, it must hold open any files it will need to Index: lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java =================================================================== --- lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java (revision 1516845) +++ lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java (working copy) @@ -70,79 +70,4 @@ /** Called when we are done adding positions & payloads * for each doc. */ public abstract void finishDoc() throws IOException; - - /** Default merge impl: append documents, mapping around - * deletes */ - public TermStats merge(final MergeState mergeState, IndexOptions indexOptions, final DocsEnum postings, final FixedBitSet visitedDocs) throws IOException { - - int df = 0; - long totTF = 0; - - if (indexOptions == IndexOptions.DOCS_ONLY) { - while(true) { - final int doc = postings.nextDoc(); - if (doc == DocIdSetIterator.NO_MORE_DOCS) { - break; - } - visitedDocs.set(doc); - this.startDoc(doc, -1); - this.finishDoc(); - df++; - } - totTF = -1; - } else if (indexOptions == IndexOptions.DOCS_AND_FREQS) { - while(true) { - final int doc = postings.nextDoc(); - if (doc == DocIdSetIterator.NO_MORE_DOCS) { - break; - } - visitedDocs.set(doc); - final int freq = postings.freq(); - this.startDoc(doc, freq); - this.finishDoc(); - df++; - totTF += freq; - } - } else if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { - final DocsAndPositionsEnum postingsEnum = (DocsAndPositionsEnum) postings; - while(true) { - final int doc = postingsEnum.nextDoc(); - if (doc == DocIdSetIterator.NO_MORE_DOCS) { - break; - } - visitedDocs.set(doc); - final int freq = postingsEnum.freq(); - this.startDoc(doc, freq); - totTF += freq; - for(int i=0;i getComparator() throws IOException; - - private MappingMultiDocsEnum docsEnum; - private MappingMultiDocsEnum docsAndFreqsEnum; - private MappingMultiDocsAndPositionsEnum postingsEnum; - - /** Default merge impl */ - public void merge(MergeState mergeState, IndexOptions indexOptions, TermsEnum termsEnum) throws IOException { - - BytesRef term; - assert termsEnum != null; - long sumTotalTermFreq = 0; - long sumDocFreq = 0; - long sumDFsinceLastAbortCheck = 0; - FixedBitSet visitedDocs = new FixedBitSet(mergeState.segmentInfo.getDocCount()); - - if (indexOptions == IndexOptions.DOCS_ONLY) { - if (docsEnum == null) { - docsEnum = new MappingMultiDocsEnum(); - } - docsEnum.setMergeState(mergeState); - - MultiDocsEnum docsEnumIn = null; - - while((term = termsEnum.next()) != null) { - // We can pass null for liveDocs, because the - // mapping enum will skip the non-live docs: - docsEnumIn = (MultiDocsEnum) termsEnum.docs(null, docsEnumIn, DocsEnum.FLAG_NONE); - if (docsEnumIn != null) { - docsEnum.reset(docsEnumIn); - final PostingsConsumer postingsConsumer = startTerm(term); - final TermStats stats = postingsConsumer.merge(mergeState, indexOptions, docsEnum, visitedDocs); - if (stats.docFreq > 0) { - finishTerm(term, stats); - sumTotalTermFreq += stats.docFreq; - sumDFsinceLastAbortCheck += stats.docFreq; - sumDocFreq += stats.docFreq; - if (sumDFsinceLastAbortCheck > 60000) { - mergeState.checkAbort.work(sumDFsinceLastAbortCheck/5.0); - sumDFsinceLastAbortCheck = 0; - } - } - } - } - } else if (indexOptions == IndexOptions.DOCS_AND_FREQS) { - if (docsAndFreqsEnum == null) { - docsAndFreqsEnum = new MappingMultiDocsEnum(); - } - docsAndFreqsEnum.setMergeState(mergeState); - - MultiDocsEnum docsAndFreqsEnumIn = null; - - while((term = 
termsEnum.next()) != null) { - // We can pass null for liveDocs, because the - // mapping enum will skip the non-live docs: - docsAndFreqsEnumIn = (MultiDocsEnum) termsEnum.docs(null, docsAndFreqsEnumIn); - assert docsAndFreqsEnumIn != null; - docsAndFreqsEnum.reset(docsAndFreqsEnumIn); - final PostingsConsumer postingsConsumer = startTerm(term); - final TermStats stats = postingsConsumer.merge(mergeState, indexOptions, docsAndFreqsEnum, visitedDocs); - if (stats.docFreq > 0) { - finishTerm(term, stats); - sumTotalTermFreq += stats.totalTermFreq; - sumDFsinceLastAbortCheck += stats.docFreq; - sumDocFreq += stats.docFreq; - if (sumDFsinceLastAbortCheck > 60000) { - mergeState.checkAbort.work(sumDFsinceLastAbortCheck/5.0); - sumDFsinceLastAbortCheck = 0; - } - } - } - } else if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { - if (postingsEnum == null) { - postingsEnum = new MappingMultiDocsAndPositionsEnum(); - } - postingsEnum.setMergeState(mergeState); - MultiDocsAndPositionsEnum postingsEnumIn = null; - while((term = termsEnum.next()) != null) { - // We can pass null for liveDocs, because the - // mapping enum will skip the non-live docs: - postingsEnumIn = (MultiDocsAndPositionsEnum) termsEnum.docsAndPositions(null, postingsEnumIn, DocsAndPositionsEnum.FLAG_PAYLOADS); - assert postingsEnumIn != null; - postingsEnum.reset(postingsEnumIn); - - final PostingsConsumer postingsConsumer = startTerm(term); - final TermStats stats = postingsConsumer.merge(mergeState, indexOptions, postingsEnum, visitedDocs); - if (stats.docFreq > 0) { - finishTerm(term, stats); - sumTotalTermFreq += stats.totalTermFreq; - sumDFsinceLastAbortCheck += stats.docFreq; - sumDocFreq += stats.docFreq; - if (sumDFsinceLastAbortCheck > 60000) { - mergeState.checkAbort.work(sumDFsinceLastAbortCheck/5.0); - sumDFsinceLastAbortCheck = 0; - } - } - } - } else { - assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; - if (postingsEnum == null) { - postingsEnum = new MappingMultiDocsAndPositionsEnum(); - } - postingsEnum.setMergeState(mergeState); - MultiDocsAndPositionsEnum postingsEnumIn = null; - while((term = termsEnum.next()) != null) { - // We can pass null for liveDocs, because the - // mapping enum will skip the non-live docs: - postingsEnumIn = (MultiDocsAndPositionsEnum) termsEnum.docsAndPositions(null, postingsEnumIn); - assert postingsEnumIn != null; - postingsEnum.reset(postingsEnumIn); - - final PostingsConsumer postingsConsumer = startTerm(term); - final TermStats stats = postingsConsumer.merge(mergeState, indexOptions, postingsEnum, visitedDocs); - if (stats.docFreq > 0) { - finishTerm(term, stats); - sumTotalTermFreq += stats.totalTermFreq; - sumDFsinceLastAbortCheck += stats.docFreq; - sumDocFreq += stats.docFreq; - if (sumDFsinceLastAbortCheck > 60000) { - mergeState.checkAbort.work(sumDFsinceLastAbortCheck/5.0); - sumDFsinceLastAbortCheck = 0; - } - } - } - } - finish(indexOptions == IndexOptions.DOCS_ONLY ? 
-1 : sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality()); - } } Index: lucene/core/src/java/org/apache/lucene/codecs/MappingMultiDocsAndPositionsEnum.java =================================================================== --- lucene/core/src/java/org/apache/lucene/codecs/MappingMultiDocsAndPositionsEnum.java (revision 1516845) +++ lucene/core/src/java/org/apache/lucene/codecs/MappingMultiDocsAndPositionsEnum.java (working copy) @@ -41,6 +41,7 @@ int currentBase; int doc = -1; private MergeState mergeState; + MultiDocsAndPositionsEnum multiDocsAndPositionsEnum; /** Sole constructor. */ public MappingMultiDocsAndPositionsEnum() { @@ -51,6 +52,7 @@ this.subs = postingsEnum.getSubs(); upto = -1; current = null; + this.multiDocsAndPositionsEnum = postingsEnum; return this; } Index: lucene/core/src/java/org/apache/lucene/codecs/MappingMultiDocsEnum.java =================================================================== --- lucene/core/src/java/org/apache/lucene/codecs/MappingMultiDocsEnum.java (revision 1516845) +++ lucene/core/src/java/org/apache/lucene/codecs/MappingMultiDocsEnum.java (working copy) @@ -40,6 +40,7 @@ int currentBase; int doc = -1; private MergeState mergeState; + MultiDocsEnum multiDocsEnum; /** Sole constructor. */ public MappingMultiDocsEnum() { @@ -48,6 +49,7 @@ MappingMultiDocsEnum reset(MultiDocsEnum docsEnum) { this.numSubs = docsEnum.getNumSubs(); this.subs = docsEnum.getSubs(); + this.multiDocsEnum = docsEnum; upto = -1; current = null; return this; Index: lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java =================================================================== --- lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java (revision 1516845) +++ lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java (working copy) @@ -108,7 +108,7 @@ /** Collapse the hash table & sort in-place. */ public int[] sortPostings(Comparator termComp) { - return bytesHash.sort(termComp); + return bytesHash.sort(termComp); } private boolean doCall; @@ -136,7 +136,8 @@ // Secondary entry point (for 2nd & subsequent TermsHash), // because token text has already been "interned" into - // textStart, so we hash by textStart + // textStart, so we hash by textStart. term vectors use + // this API. public void add(int textStart) throws IOException { int termID = bytesHash.addByPoolOffset(textStart); if (termID >= 0) { // New posting @@ -173,7 +174,8 @@ } } - // Primary entry point (for first TermsHash) + // Primary entry point (for first TermsHash); postings use + // this API. @Override void add() throws IOException { Index: lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java =================================================================== --- lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java (working copy) @@ -0,0 +1,562 @@ +package org.apache.lucene.index; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Comparator; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.FreqProxTermsWriterPerField.FreqProxPostingsArray; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; + +/** Implements {@link Fields} interface over the in-RAM + * buffered fields/terms/postings, to flush postings + * through the PostingsFormat. */ + +// nocommit should we factor out an "only the iterables" +// part of Fields/Terms/*Enums? ie, instead of throwing exc +// from all stats? we can't really support the stats +// because they'd require an extra pass during merging +// ... so this is really just "iterables" + +class FreqProxFields extends Fields { + final Map fields = new LinkedHashMap(); + + public FreqProxFields(List fieldList) { + // NOTE: fields are already sorted by field name + for(FreqProxTermsWriterPerField field : fieldList) { + fields.put(field.fieldInfo.name, field); + } + } + + public Iterator iterator() { + return fields.keySet().iterator(); + } + + @Override + public Terms terms(String field) throws IOException { + FreqProxTermsWriterPerField perField = fields.get(field); + return perField == null ? null : new FreqProxTerms(perField); + } + + @Override + public int size() { + // nocommit not ... safe? what if one of these fields + // has no terms? + //return fields.size(); + throw new UnsupportedOperationException(); + } + + private static class FreqProxTerms extends Terms { + final FreqProxTermsWriterPerField terms; + + public FreqProxTerms(FreqProxTermsWriterPerField terms) { + this.terms = terms; + } + + @Override + public TermsEnum iterator(TermsEnum reuse) { + FreqProxTermsEnum termsEnum; + if (reuse instanceof FreqProxTermsEnum && ((FreqProxTermsEnum) reuse).terms == this.terms) { + termsEnum = (FreqProxTermsEnum) reuse; + } else { + termsEnum = new FreqProxTermsEnum(terms); + } + termsEnum.reset(); + return termsEnum; + } + + // nocommit in old TermsConsumer API, consumer could + // override comparator ... but I think we just nuke/lose + // this. + + @Override + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + @Override + public long size() { + // nocommit throw exc instead? merging cannot do this: + // nocommit not ... safe? what if one of these terms + // has no docs? + //return terms.termsHashPerField.bytesHash.size(); + throw new UnsupportedOperationException(); + } + + @Override + public long getSumTotalTermFreq() { + // nocommit throw exc instead? merging cannot do this: + return terms.sumTotalTermFreq; + } + + @Override + public long getSumDocFreq() { + // nocommit throw exc instead? merging cannot do this: + return terms.sumDocFreq; + } + + @Override + public int getDocCount() { + // nocommit throw exc instead? 
merging cannot do this: + return terms.docCount; + } + + @Override + public boolean hasOffsets() { + // NOTE: the in-memory buffer may have indexed offsets + // because that's what FieldInfo said when we started, + // but during indexing this may have been downgraded: + return terms.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + } + + @Override + public boolean hasPositions() { + // NOTE: the in-memory buffer may have indexed positions + // because that's what FieldInfo said when we started, + // but during indexing this may have been downgraded: + return terms.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + } + + @Override + public boolean hasPayloads() { + return terms.hasPayloads; + } + } + + private static class FreqProxTermsEnum extends TermsEnum { + final FreqProxTermsWriterPerField terms; + final int[] sortedTermIDs; + final FreqProxPostingsArray postingsArray; + final BytesRef scratch = new BytesRef(); + BytesRef currentTerm; + final int numTerms; + int ord; + + public FreqProxTermsEnum(FreqProxTermsWriterPerField terms) { + this.terms = terms; + this.numTerms = terms.termsHashPerField.bytesHash.size(); + sortedTermIDs = terms.sortedTermIDs; + assert sortedTermIDs != null; + postingsArray = (FreqProxPostingsArray) terms.termsHashPerField.postingsArray; + } + + @Override + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + public void reset() { + ord = -1; + } + + // nocommit impl seekExact(BytesRef)? + + public SeekStatus seekCeil(BytesRef text) { + + // nocommit we could also ... keep the BytesRefHash + // intact so this is a hash lookup + + // binary search: + int lo = 0; + int hi = numTerms - 1; + while (hi >= lo) { + int mid = (lo + hi) >>> 1; + int textStart = postingsArray.textStarts[sortedTermIDs[mid]]; + terms.termsHashPerField.bytePool.setBytesRef(scratch, textStart); + int cmp = scratch.compareTo(text); + if (cmp < 0) { + lo = mid + 1; + } else if (cmp > 0) { + hi = mid - 1; + } else { + // found: + ord = mid; + currentTerm = scratch; + return SeekStatus.FOUND; + } + } + + // not found: + ord = lo + 1; + if (ord == numTerms) { + currentTerm = null; + return SeekStatus.END; + } else { + // nocommit do i need to do this again? loop may + // not end on this term? + //int textStart = postingsArray.textStarts[sortedTermIDs[mid]]; + //termsHashPerField.bytePool.setBytesRef(scratch, textStart); + currentTerm = scratch; + return SeekStatus.NOT_FOUND; + } + } + + public void seekExact(long ord) { + this.ord = (int) ord; + int textStart = postingsArray.textStarts[sortedTermIDs[this.ord]]; + terms.termsHashPerField.bytePool.setBytesRef(scratch, textStart); + currentTerm = scratch; + } + + // nocommit seekExact(term, termState)? + + @Override + public BytesRef next() { + ord++; + if (ord >= numTerms) { + currentTerm = null; + } else { + int textStart = postingsArray.textStarts[sortedTermIDs[ord]]; + terms.termsHashPerField.bytePool.setBytesRef(scratch, textStart); + currentTerm = scratch; + } + return currentTerm; + } + + @Override + public BytesRef term() { + // nocommit can we just return scratch? 
+ return currentTerm; + } + + @Override + public long ord() { + return ord; + } + + @Override + public int docFreq() { + // We do not store this per-term, and we cannot + // implement this at merge time w/o an added pass + // through the postings: + throw new UnsupportedOperationException(); + } + + @Override + public long totalTermFreq() { + // We do not store this per-term, and we cannot + // implement this at merge time w/o an added pass + // through the postings: + throw new UnsupportedOperationException(); + } + + @Override + public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) { + if (liveDocs != null) { + throw new IllegalArgumentException("liveDocs must be null"); + } + + FreqProxDocsEnum docsEnum; + + if (!terms.hasFreq && (flags & DocsEnum.FLAG_FREQS) != 0) { + // Caller wants freqs but we didn't index them; + // don't lie: + throw new IllegalArgumentException("did not index freq"); + } + + if (reuse instanceof FreqProxDocsEnum) { + docsEnum = (FreqProxDocsEnum) reuse; + if (docsEnum.postingsArray != postingsArray) { + docsEnum = new FreqProxDocsEnum(terms, postingsArray); + } + } else { + docsEnum = new FreqProxDocsEnum(terms, postingsArray); + } + docsEnum.reset(sortedTermIDs[ord]); + return docsEnum; + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) { + if (liveDocs != null) { + throw new IllegalArgumentException("liveDocs must be null"); + } + FreqProxDocsAndPositionsEnum posEnum; + + if (!terms.hasProx) { + // Caller wants positions but we didn't index them; + // don't lie: + throw new IllegalArgumentException("did not index positions"); + } + + if (!terms.hasOffsets && (flags & DocsAndPositionsEnum.FLAG_OFFSETS) != 0) { + // Caller wants offsets but we didn't index them; + // don't lie: + throw new IllegalArgumentException("did not index offsets"); + } + + if (reuse instanceof FreqProxDocsAndPositionsEnum) { + posEnum = (FreqProxDocsAndPositionsEnum) reuse; + if (posEnum.postingsArray != postingsArray) { + posEnum = new FreqProxDocsAndPositionsEnum(terms, postingsArray); + } + } else { + posEnum = new FreqProxDocsAndPositionsEnum(terms, postingsArray); + } + posEnum.reset(sortedTermIDs[ord]); + return posEnum; + } + + /** + * Expert: Returns the TermsEnums internal state to position the TermsEnum + * without re-seeking the term dictionary. + *
+   * <p>
+ * NOTE: A seek by {@link TermState} might not capture the + * {@link AttributeSource}'s state. Callers must maintain the + * {@link AttributeSource} states separately + * + * @see TermState + * @see #seekExact(BytesRef, TermState) + */ + public TermState termState() throws IOException { + return new TermState() { + @Override + public void copyFrom(TermState other) { + throw new UnsupportedOperationException(); + } + }; + } + } + + private static class FreqProxDocsEnum extends DocsEnum { + + final FreqProxTermsWriterPerField terms; + final FreqProxPostingsArray postingsArray; + final ByteSliceReader reader = new ByteSliceReader(); + final boolean readTermFreq; + int docID; + int freq; + boolean ended; + int termID; + + public FreqProxDocsEnum(FreqProxTermsWriterPerField terms, FreqProxPostingsArray postingsArray) { + this.terms = terms; + this.postingsArray = postingsArray; + this.readTermFreq = terms.hasFreq; + } + + public void reset(int termID) { + this.termID = termID; + terms.termsHashPerField.initReader(reader, termID, 0); + ended = false; + docID = 0; + } + + @Override + public int docID() { + return docID; + } + + @Override + public int freq() { + // Don't lie here ... don't want codecs writings lots + // of wasted 1s into the index: + if (!readTermFreq) { + throw new IllegalStateException("freq was not indexed"); + } else { + return freq; + } + } + + @Override + public int nextDoc() throws IOException { + if (reader.eof()) { + if (ended) { + return NO_MORE_DOCS; + } else { + ended = true; + docID = postingsArray.lastDocIDs[termID]; + if (readTermFreq) { + freq = postingsArray.termFreqs[termID]; + } + } + } else { + int code = reader.readVInt(); + if (!readTermFreq) { + docID += code; + } else { + docID += code >>> 1; + if ((code & 1) != 0) { + freq = 1; + } else { + freq = reader.readVInt(); + } + } + + assert docID != postingsArray.lastDocIDs[termID]; + } + + return docID; + } + + @Override + public int advance(int target) { + throw new UnsupportedOperationException(); + } + + @Override + public long cost() { + throw new UnsupportedOperationException(); + } + } + + private static class FreqProxDocsAndPositionsEnum extends DocsAndPositionsEnum { + + final FreqProxTermsWriterPerField terms; + final FreqProxPostingsArray postingsArray; + final ByteSliceReader reader = new ByteSliceReader(); + final ByteSliceReader posReader = new ByteSliceReader(); + final boolean readOffsets; + int docID; + int freq; + int pos; + int startOffset; + int endOffset; + int posLeft; + int termID; + boolean ended; + boolean hasPayload; + BytesRef payload = new BytesRef(); + + public FreqProxDocsAndPositionsEnum(FreqProxTermsWriterPerField terms, FreqProxPostingsArray postingsArray) { + this.terms = terms; + this.postingsArray = postingsArray; + this.readOffsets = terms.hasOffsets; + assert terms.hasProx; + assert terms.hasFreq; + } + + public void reset(int termID) { + this.termID = termID; + terms.termsHashPerField.initReader(reader, termID, 0); + terms.termsHashPerField.initReader(posReader, termID, 1); + ended = false; + docID = 0; + posLeft = 0; + } + + @Override + public int docID() { + return docID; + } + + @Override + public int freq() { + return freq; + } + + @Override + public int nextDoc() throws IOException { + while (posLeft-- != 0) { + nextPosition(); + } + + if (reader.eof()) { + if (ended) { + return NO_MORE_DOCS; + } else { + ended = true; + docID = postingsArray.lastDocIDs[termID]; + freq = postingsArray.termFreqs[termID]; + } + } else { + int code = reader.readVInt(); + docID += code 
>>> 1; + if ((code & 1) != 0) { + freq = 1; + } else { + freq = reader.readVInt(); + } + + assert docID != postingsArray.lastDocIDs[termID]; + } + + posLeft = freq; + pos = 0; + startOffset = 0; + return docID; + } + + @Override + public int advance(int target) { + throw new UnsupportedOperationException(); + } + + @Override + public long cost() { + throw new UnsupportedOperationException(); + } + + @Override + public int nextPosition() throws IOException { + assert posLeft > 0; + posLeft--; + int code = posReader.readVInt(); + pos += code >>> 1; + if ((code & 1) != 0) { + hasPayload = true; + // has a payload + payload.length = posReader.readVInt(); + if (payload.bytes.length < payload.length) { + payload.grow(payload.length); + } + posReader.readBytes(payload.bytes, 0, payload.length); + } else { + hasPayload = false; + } + + if (readOffsets) { + startOffset += posReader.readVInt(); + endOffset = startOffset + posReader.readVInt(); + } + + return pos; + } + + @Override + public int startOffset() { + if (!readOffsets) { + throw new IllegalStateException("offsets were not indexed"); + } + return startOffset; + } + + @Override + public int endOffset() { + if (!readOffsets) { + throw new IllegalStateException("offsets were not indexed"); + } + return endOffset; + } + + @Override + public BytesRef getPayload() { + if (hasPayload) { + return payload; + } else { + return null; + } + } + } +} Property changes on: lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java =================================================================== --- lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (revision 1516845) +++ lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (working copy) @@ -42,12 +42,17 @@ final FieldInfo fieldInfo; final DocumentsWriterPerThread.DocState docState; final FieldInvertState fieldState; - private boolean hasFreq; - private boolean hasProx; - private boolean hasOffsets; + boolean hasFreq; + boolean hasProx; + boolean hasOffsets; PayloadAttribute payloadAttribute; OffsetAttribute offsetAttribute; + long sumTotalTermFreq; + long sumDocFreq; + // How many docs have this field: + int docCount; + public FreqProxTermsWriterPerField(TermsHashPerField termsHashPerField, FreqProxTermsWriter parent, FieldInfo fieldInfo) { this.termsHashPerField = termsHashPerField; this.parent = parent; @@ -68,6 +73,16 @@ @Override void finish() { + sumDocFreq += fieldState.uniqueTermCount; + sumTotalTermFreq += fieldState.length; + if (fieldState.length > 0) { + docCount++; + } + + // nocommit we don't expose "overlaps" in any statistic + // today ... should we? i guess we cannot in general + // since we don't save this in the index + if (hasPayloads) { fieldInfo.setStorePayloads(); } @@ -83,6 +98,8 @@ return fieldInfo.name.compareTo(other.fieldInfo.name); } + // nocommit is this really called? we don't reuse? + // Called after flush void reset() { // Record, up front, whether our in-RAM format will be @@ -318,233 +335,10 @@ BytesRef payload; - /* Walk through all unique text tokens (Posting - * instances) found in this field and serialize them - * into a single RAM segment. 
*/ - void flush(String fieldName, FieldsConsumer consumer, final SegmentWriteState state) - throws IOException { + int[] sortedTermIDs; - if (!fieldInfo.isIndexed()) { - return; // nothing to flush, don't bother the codec with the unindexed field - } - - final TermsConsumer termsConsumer = consumer.addField(fieldInfo); - final Comparator termComp = termsConsumer.getComparator(); - - // CONFUSING: this.indexOptions holds the index options - // that were current when we first saw this field. But - // it's possible this has changed, eg when other - // documents are indexed that cause a "downgrade" of the - // IndexOptions. So we must decode the in-RAM buffer - // according to this.indexOptions, but then write the - // new segment to the directory according to - // currentFieldIndexOptions: - final IndexOptions currentFieldIndexOptions = fieldInfo.getIndexOptions(); - assert currentFieldIndexOptions != null; - - final boolean writeTermFreq = currentFieldIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; - final boolean writePositions = currentFieldIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; - final boolean writeOffsets = currentFieldIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; - - final boolean readTermFreq = this.hasFreq; - final boolean readPositions = this.hasProx; - final boolean readOffsets = this.hasOffsets; - - //System.out.println("flush readTF=" + readTermFreq + " readPos=" + readPositions + " readOffs=" + readOffsets); - - // Make sure FieldInfo.update is working correctly!: - assert !writeTermFreq || readTermFreq; - assert !writePositions || readPositions; - assert !writeOffsets || readOffsets; - - assert !writeOffsets || writePositions; - - final Map segDeletes; - if (state.segDeletes != null && state.segDeletes.terms.size() > 0) { - segDeletes = state.segDeletes.terms; - } else { - segDeletes = null; - } - - final int[] termIDs = termsHashPerField.sortPostings(termComp); - final int numTerms = termsHashPerField.bytesHash.size(); - final BytesRef text = new BytesRef(); - final FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray; - final ByteSliceReader freq = new ByteSliceReader(); - final ByteSliceReader prox = new ByteSliceReader(); - - FixedBitSet visitedDocs = new FixedBitSet(state.segmentInfo.getDocCount()); - long sumTotalTermFreq = 0; - long sumDocFreq = 0; - - Term protoTerm = new Term(fieldName); - for (int i = 0; i < numTerms; i++) { - final int termID = termIDs[i]; - //System.out.println("term=" + termID); - // Get BytesRef - final int textStart = postings.textStarts[termID]; - termsHashPerField.bytePool.setBytesRef(text, textStart); - - termsHashPerField.initReader(freq, termID, 0); - if (readPositions || readOffsets) { - termsHashPerField.initReader(prox, termID, 1); - } - - // TODO: really TermsHashPerField should take over most - // of this loop, including merge sort of terms from - // multiple threads and interacting with the - // TermsConsumer, only calling out to us (passing us the - // DocsConsumer) to handle delivery of docs/positions - - final PostingsConsumer postingsConsumer = termsConsumer.startTerm(text); - - final int delDocLimit; - if (segDeletes != null) { - protoTerm.bytes = text; - final Integer docIDUpto = segDeletes.get(protoTerm); - if (docIDUpto != null) { - delDocLimit = docIDUpto; - } else { - delDocLimit = 0; - } - } else { - delDocLimit = 0; - } - - // Now termStates has numToMerge FieldMergeStates - // which all share the same term. 
Now we must - // interleave the docID streams. - int docFreq = 0; - long totalTermFreq = 0; - int docID = 0; - - while(true) { - //System.out.println(" cycle"); - final int termFreq; - if (freq.eof()) { - if (postings.lastDocCodes[termID] != -1) { - // Return last doc - docID = postings.lastDocIDs[termID]; - if (readTermFreq) { - termFreq = postings.termFreqs[termID]; - } else { - termFreq = -1; - } - postings.lastDocCodes[termID] = -1; - } else { - // EOF - break; - } - } else { - final int code = freq.readVInt(); - if (!readTermFreq) { - docID += code; - termFreq = -1; - } else { - docID += code >>> 1; - if ((code & 1) != 0) { - termFreq = 1; - } else { - termFreq = freq.readVInt(); - } - } - - assert docID != postings.lastDocIDs[termID]; - } - - docFreq++; - assert docID < state.segmentInfo.getDocCount(): "doc=" + docID + " maxDoc=" + state.segmentInfo.getDocCount(); - - // NOTE: we could check here if the docID was - // deleted, and skip it. However, this is somewhat - // dangerous because it can yield non-deterministic - // behavior since we may see the docID before we see - // the term that caused it to be deleted. This - // would mean some (but not all) of its postings may - // make it into the index, which'd alter the docFreq - // for those terms. We could fix this by doing two - // passes, ie first sweep marks all del docs, and - // 2nd sweep does the real flush, but I suspect - // that'd add too much time to flush. - visitedDocs.set(docID); - postingsConsumer.startDoc(docID, writeTermFreq ? termFreq : -1); - if (docID < delDocLimit) { - // Mark it deleted. TODO: we could also skip - // writing its postings; this would be - // deterministic (just for this Term's docs). - - // TODO: can we do this reach-around in a cleaner way???? - if (state.liveDocs == null) { - state.liveDocs = docState.docWriter.codec.liveDocsFormat().newLiveDocs(state.segmentInfo.getDocCount()); - } - if (state.liveDocs.get(docID)) { - state.delCountOnFlush++; - state.liveDocs.clear(docID); - } - } - - totalTermFreq += termFreq; - - // Carefully copy over the prox + payload info, - // changing the format to match Lucene's segment - // format. - - if (readPositions || readOffsets) { - // we did record positions (& maybe payload) and/or offsets - int position = 0; - int offset = 0; - for(int j=0;j>> 1; - - if ((code & 1) != 0) { - - // This position has a payload - final int payloadLength = prox.readVInt(); - - if (payload == null) { - payload = new BytesRef(); - payload.bytes = new byte[payloadLength]; - } else if (payload.bytes.length < payloadLength) { - payload.grow(payloadLength); - } - - prox.readBytes(payload.bytes, 0, payloadLength); - payload.length = payloadLength; - thisPayload = payload; - - } else { - thisPayload = null; - } - - if (readOffsets) { - final int startOffset = offset + prox.readVInt(); - final int endOffset = startOffset + prox.readVInt(); - if (writePositions) { - if (writeOffsets) { - assert startOffset >=0 && endOffset >= startOffset : "startOffset=" + startOffset + ",endOffset=" + endOffset + ",offset=" + offset; - postingsConsumer.addPosition(position, thisPayload, startOffset, endOffset); - } else { - postingsConsumer.addPosition(position, thisPayload, -1, -1); - } - } - offset = startOffset; - } else if (writePositions) { - postingsConsumer.addPosition(position, thisPayload, -1, -1); - } - } - } - } - postingsConsumer.finishDoc(); - } - termsConsumer.finishTerm(text, new TermStats(docFreq, writeTermFreq ? 
totalTermFreq : -1)); - sumTotalTermFreq += totalTermFreq; - sumDocFreq += docFreq; - } - - termsConsumer.finish(writeTermFreq ? sumTotalTermFreq : -1, sumDocFreq, visitedDocs.cardinality()); + void sortPostings(Comparator termComp) { + assert sortedTermIDs == null; + sortedTermIDs = termsHashPerField.sortPostings(termComp); } } Index: lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java =================================================================== --- lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java (revision 1516845) +++ lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java (working copy) @@ -22,9 +22,10 @@ import java.util.List; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.FieldInfosWriter; import org.apache.lucene.codecs.FieldsConsumer; -import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.codecs.MappedMultiFields; import org.apache.lucene.codecs.StoredFieldsWriter; import org.apache.lucene.codecs.TermVectorsWriter; import org.apache.lucene.index.FieldInfo.DocValuesType; @@ -375,7 +376,15 @@ docBase += maxDoc; } - final FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(segmentWriteState); + //final FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(segmentWriteState); + + Fields mergedFields = new MappedMultiFields(mergeState, + new MultiFields(fields.toArray(Fields.EMPTY_ARRAY), + slices.toArray(ReaderSlice.EMPTY_ARRAY))); + + codec.postingsFormat().write(mergedFields, segmentWriteState); + + /* boolean success = false; try { consumer.merge(mergeState, @@ -389,5 +398,6 @@ IOUtils.closeWhileHandlingException(consumer); } } + */ } } Index: lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java =================================================================== --- lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java (revision 1516845) +++ lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java (working copy) @@ -19,6 +19,8 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; import java.util.List; import java.util.Map; @@ -32,6 +34,50 @@ @Override void abort() {} + private void applyDeletes(SegmentWriteState state, Fields fields) throws IOException { + // Process any pending Term deletes for this newly + // flushed segment: + if (state.segDeletes != null && state.segDeletes.terms.size() > 0) { + Map segDeletes = state.segDeletes.terms; + List deleteTerms = new ArrayList(segDeletes.keySet()); + Collections.sort(deleteTerms); + String lastField = null; + TermsEnum termsEnum = null; + DocsEnum docsEnum = null; + for(Term deleteTerm : deleteTerms) { + if (deleteTerm.field().equals(lastField) == false) { + lastField = deleteTerm.field(); + Terms terms = fields.terms(lastField); + if (terms != null) { + termsEnum = terms.iterator(termsEnum); + } + } + + if (termsEnum != null && termsEnum.seekExact(deleteTerm.bytes())) { + docsEnum = termsEnum.docs(null, docsEnum, 0); + int delDocLimit = segDeletes.get(deleteTerm); + while (true) { + int doc = docsEnum.nextDoc(); + if (doc == DocsEnum.NO_MORE_DOCS) { + break; + } + if (doc < delDocLimit) { + if (state.liveDocs == null) { + state.liveDocs = state.segmentInfo.getCodec().liveDocsFormat().newLiveDocs(state.segmentInfo.getDocCount()); + } + if (state.liveDocs.get(doc)) { + state.delCountOnFlush++; + state.liveDocs.clear(doc); + } + } else { + break; + } + } + } + } + } 
+ } + // TODO: would be nice to factor out more of this, eg the // FreqProxFieldMergeState, and code to visit all Fields // under the same FieldInfo together, up into TermsHash*. @@ -44,9 +90,13 @@ // ThreadStates List allFields = new ArrayList(); + Comparator termComp = BytesRef.getUTF8SortedAsUnicodeComparator(); + for (TermsHashConsumerPerField f : fieldsToFlush.values()) { final FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField) f; if (perField.termsHashPerField.bytesHash.size() > 0) { + perField.sortPostings(termComp); + assert perField.fieldInfo.isIndexed(); allFields.add(perField); } } @@ -56,53 +106,30 @@ // Sort by field name CollectionUtil.introSort(allFields); - final FieldsConsumer consumer = state.segmentInfo.getCodec().postingsFormat().fieldsConsumer(state); + Fields fields = new FreqProxFields(allFields); - boolean success = false; + applyDeletes(state, fields); - try { - TermsHash termsHash = null; + state.segmentInfo.getCodec().postingsFormat().write(fields, state); + + TermsHash termsHash = null; - /* - Current writer chain: - FieldsConsumer - -> IMPL: FormatPostingsTermsDictWriter - -> TermsConsumer - -> IMPL: FormatPostingsTermsDictWriter.TermsWriter - -> DocsConsumer - -> IMPL: FormatPostingsDocsWriter - -> PositionsConsumer - -> IMPL: FormatPostingsPositionsWriter - */ - - for (int fieldNumber = 0; fieldNumber < numAllFields; fieldNumber++) { - final FieldInfo fieldInfo = allFields.get(fieldNumber).fieldInfo; + for (int fieldNumber = 0; fieldNumber < numAllFields; fieldNumber++) { + final FieldInfo fieldInfo = allFields.get(fieldNumber).fieldInfo; - final FreqProxTermsWriterPerField fieldWriter = allFields.get(fieldNumber); + final FreqProxTermsWriterPerField fieldWriter = allFields.get(fieldNumber); - // If this field has postings then add them to the - // segment - fieldWriter.flush(fieldInfo.name, consumer, state); - - TermsHashPerField perField = fieldWriter.termsHashPerField; - assert termsHash == null || termsHash == perField.termsHash; - termsHash = perField.termsHash; - int numPostings = perField.bytesHash.size(); - perField.reset(); - perField.shrinkHash(numPostings); - fieldWriter.reset(); - } + TermsHashPerField perField = fieldWriter.termsHashPerField; + assert termsHash == null || termsHash == perField.termsHash; + termsHash = perField.termsHash; + int numPostings = perField.bytesHash.size(); + perField.reset(); + perField.shrinkHash(numPostings); + fieldWriter.reset(); + } - if (termsHash != null) { - termsHash.reset(); - } - success = true; - } finally { - if (success) { - IOUtils.close(consumer); - } else { - IOUtils.closeWhileHandlingException(consumer); - } + if (termsHash != null) { + termsHash.reset(); } }
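A note on FreqProxTermsEnum.seekCeil in the new FreqProxFields.java above: when the binary search falls through without a match, the patch sets ord = lo + 1 and reuses scratch as-is, and the nocommit there asks whether scratch needs to be re-decoded. As I read the loop, lo is already the insertion point (the ord of the first term greater than the target), and the final probe may have left scratch on a smaller term, so both points look worth tightening. Below is a minimal sketch of the not-found branch under that reading; field names (postingsArray, sortedTermIDs, scratch, currentTerm, numTerms, terms.termsHashPerField) follow the patch, and the change has not been verified against the test suite:

    // not found: lo is the insertion point, i.e. the ord of the first term > text
    ord = lo;
    if (ord >= numTerms) {
      currentTerm = null;
      return SeekStatus.END;
    } else {
      // the final probe of the binary search may have left scratch on a
      // different (smaller) term, so decode the term at ord again before returning
      int textStart = postingsArray.textStarts[sortedTermIDs[ord]];
      terms.termsHashPerField.bytePool.setBytesRef(scratch, textStart);
      currentTerm = scratch;
      return SeekStatus.NOT_FOUND;
    }

With that change a NOT_FOUND seek leaves the enum positioned on the smallest term greater than the target, which is what TermsEnum.seekCeil callers expect.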