Index: src/test/org/apache/lucene/store/MockRAMInputStream.java =================================================================== --- src/test/org/apache/lucene/store/MockRAMInputStream.java (revision 718730) +++ src/test/org/apache/lucene/store/MockRAMInputStream.java (working copy) @@ -1,7 +1,5 @@ package org.apache.lucene.store; -import java.io.IOException; - /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -19,6 +17,8 @@ * limitations under the License. */ +import java.io.IOException; + /** * Used by MockRAMDirectory to create an input stream that * keeps track of when it's been closed. @@ -44,16 +44,8 @@ // all clones get closed: if (!isClone) { synchronized(dir.openFiles) { - Integer v = (Integer) dir.openFiles.get(name); - // Could be null when MockRAMDirectory.crash() was called - if (v != null) { - if (v.intValue() == 1) { - dir.openFiles.remove(name); - } else { - v = new Integer(v.intValue()-1); - dir.openFiles.put(name, v); - } - } + assert dir.openFiles.containsKey(this): "input=" + name + " is not open"; + dir.openFiles.remove(this); } } } Index: src/test/org/apache/lucene/store/MockRAMDirectory.java =================================================================== --- src/test/org/apache/lucene/store/MockRAMDirectory.java (revision 718730) +++ src/test/org/apache/lucene/store/MockRAMDirectory.java (working copy) @@ -208,9 +208,11 @@ if (crashed) throw new IOException("cannot createOutput after crash"); init(); - synchronized(openFiles) { + synchronized(this) { if (preventDoubleWrite && createdFiles.contains(name) && !name.equals("segments.gen")) throw new IOException("file \"" + name + "\" was already written to"); + } + synchronized(openFiles) { if (noDeleteOpenFile && openFiles.containsKey(name)) throw new IOException("MockRAMDirectory: file \"" + name + "\" is still open: cannot overwrite"); } @@ -237,6 +239,15 @@ return new MockRAMOutputStream(this, file); } + static class OpenFile { + final String name; + final Throwable stack; + OpenFile(String name) { + this.name = name; + this.stack = new Throwable(); + } + } + public IndexInput openInput(String name) throws IOException { RAMFile file; synchronized (this) { @@ -245,17 +256,12 @@ if (file == null) throw new FileNotFoundException(name); else { + IndexInput in = new MockRAMInputStream(this, name, file); synchronized(openFiles) { - if (openFiles.containsKey(name)) { - Integer v = (Integer) openFiles.get(name); - v = new Integer(v.intValue()+1); - openFiles.put(name, v); - } else { - openFiles.put(name, new Integer(1)); - } + openFiles.put(in, new OpenFile(name)); } + return in; } - return new MockRAMInputStream(this, name, file); } /** Provided for testing purposes. Use sizeInBytes() instead. 
*/ @@ -289,7 +295,14 @@ if (noDeleteOpenFile && openFiles.size() > 0) { // RuntimeException instead of IOException because // super() does not throw IOException currently: - throw new RuntimeException("MockRAMDirectory: cannot close: there are still open files: " + openFiles); + Iterator it = openFiles.values().iterator(); + System.out.println("\nMockRAMDirectory open files:"); + while(it.hasNext()) { + OpenFile openFile = (OpenFile) it.next(); + System.out.println("\nfile " + openFile.name + " opened from:\n"); + openFile.stack.printStackTrace(System.out); + } + throw new RuntimeException("MockRAMDirectory: cannot close: there are still open files"); } } } Index: src/test/org/apache/lucene/index/TestSegmentTermEnum.java =================================================================== --- src/test/org/apache/lucene/index/TestSegmentTermEnum.java (revision 718730) +++ src/test/org/apache/lucene/index/TestSegmentTermEnum.java (working copy) @@ -65,23 +65,6 @@ verifyDocFreq(); } - public void testPrevTermAtEnd() throws IOException - { - Directory dir = new MockRAMDirectory(); - IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); - addDoc(writer, "aaa bbb"); - writer.close(); - IndexReader reader = IndexReader.open(dir); - SegmentTermEnum termEnum = (SegmentTermEnum) reader.terms(); - assertTrue(termEnum.next()); - assertEquals("aaa", termEnum.term().text()); - assertTrue(termEnum.next()); - assertEquals("aaa", termEnum.prev().text()); - assertEquals("bbb", termEnum.term().text()); - assertFalse(termEnum.next()); - assertEquals("bbb", termEnum.prev().text()); - } - private void verifyDocFreq() throws IOException { Index: src/test/org/apache/lucene/index/TestIndexReader.java =================================================================== --- src/test/org/apache/lucene/index/TestIndexReader.java (revision 718730) +++ src/test/org/apache/lucene/index/TestIndexReader.java (working copy) @@ -870,15 +870,18 @@ d.add(new Field("id", Integer.toString(i), Field.Store.YES, Field.Index.NOT_ANALYZED)); d.add(new Field("content", "aaa " + i, Field.Store.NO, Field.Index.ANALYZED)); writer.addDocument(d); + if (0==i%10) + writer.commit(); } writer.close(); - long diskUsage = startDir.sizeInBytes(); - long diskFree = diskUsage+100; + long diskUsage = ((MockRAMDirectory) startDir).getRecomputedActualSizeInBytes(); + long diskFree = diskUsage+100; IOException err = null; boolean done = false; + boolean gotExc = false; // Iterate w/ ever increasing free disk space: while(!done) { @@ -935,7 +938,7 @@ int docId = 12; for(int i=0;i<13;i++) { reader.deleteDocument(docId); - reader.setNorm(docId, "contents", (float) 2.0); + reader.setNorm(docId, "content", (float) 2.0); docId += 12; } } @@ -950,6 +953,7 @@ e.printStackTrace(System.out); } err = e; + gotExc = true; if (1 == x) { e.printStackTrace(); fail(testName + " hit IOException after disk space was freed up"); @@ -965,7 +969,7 @@ String[] startFiles = dir.list(); SegmentInfos infos = new SegmentInfos(); infos.read(dir); - new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null); + new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null, PostingsCodec.getCodec()); String[] endFiles = dir.list(); Arrays.sort(startFiles); @@ -1039,6 +1043,8 @@ newReader.close(); if (result2 == END_COUNT) { + if (!gotExc) + fail("never hit disk full"); break; } } Index: src/test/org/apache/lucene/index/TestStressIndexing2.java 
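The MockRAMDirectory change above replaces the per-name open-file refcount with a map keyed by the IndexInput instance itself, where each OpenFile value captures the opener's stack via new Throwable(); close() can then print exactly which test code leaked each input. A minimal standalone sketch of the same pattern (class and method names here are illustrative, not from the patch):

import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.Map;

// Tracks open handles by identity and remembers who opened them.
class OpenHandleTracker {
  private final Map open = new IdentityHashMap();

  public synchronized void onOpen(Object handle, String name) {
    // new Throwable() records the current stack without being thrown
    open.put(handle, new Throwable("file " + name + " opened from:"));
  }

  public synchronized void onClose(Object handle) {
    assert open.containsKey(handle): "handle is not open";
    open.remove(handle);
  }

  public synchronized void assertAllClosed() {
    Iterator it = open.values().iterator();
    while (it.hasNext())
      ((Throwable) it.next()).printStackTrace(System.out);
    if (!open.isEmpty())
      throw new RuntimeException("there are still open handles");
  }
}

Keying by instance rather than by name is also what makes double-close detectable: the new assert in MockRAMInputStream.close() fires if the same input is removed twice.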
=================================================================== --- src/test/org/apache/lucene/index/TestStressIndexing2.java (revision 718730) +++ src/test/org/apache/lucene/index/TestStressIndexing2.java (working copy) @@ -291,12 +291,12 @@ if (!termEnum2.next()) break; } + assertEquals(len1, len2); + if (len1==0) break; // no more terms + if (!hasDeletes) assertEquals(termEnum1.docFreq(), termEnum2.docFreq()); - assertEquals(len1, len2); - if (len1==0) break; // no more terms - assertEquals(term1, term2); // sort info2 to get it into ascending docid Index: src/test/org/apache/lucene/index/TestSegmentTermDocs.java =================================================================== --- src/test/org/apache/lucene/index/TestSegmentTermDocs.java (revision 718730) +++ src/test/org/apache/lucene/index/TestSegmentTermDocs.java (working copy) @@ -56,14 +56,14 @@ SegmentReader reader = SegmentReader.get(info); reader.setTermInfosIndexDivisor(indexDivisor); assertTrue(reader != null); - SegmentTermDocs segTermDocs = new SegmentTermDocs(reader); - assertTrue(segTermDocs != null); - segTermDocs.seek(new Term(DocHelper.TEXT_FIELD_2_KEY, "field")); - if (segTermDocs.next() == true) + TermDocs termDocs = reader.termDocs(); + assertTrue(termDocs != null); + termDocs.seek(new Term(DocHelper.TEXT_FIELD_2_KEY, "field")); + if (termDocs.next() == true) { - int docId = segTermDocs.doc(); + int docId = termDocs.doc(); assertTrue(docId == 0); - int freq = segTermDocs.freq(); + int freq = termDocs.freq(); assertTrue(freq == 3); } reader.close(); @@ -79,10 +79,10 @@ SegmentReader reader = SegmentReader.get(info); reader.setTermInfosIndexDivisor(indexDivisor); assertTrue(reader != null); - SegmentTermDocs segTermDocs = new SegmentTermDocs(reader); - assertTrue(segTermDocs != null); - segTermDocs.seek(new Term("textField2", "bad")); - assertTrue(segTermDocs.next() == false); + TermDocs termDocs = reader.termDocs(); + assertTrue(termDocs != null); + termDocs.seek(new Term("textField2", "bad")); + assertTrue(termDocs.next() == false); reader.close(); } { @@ -90,10 +90,10 @@ SegmentReader reader = SegmentReader.get(info); reader.setTermInfosIndexDivisor(indexDivisor); assertTrue(reader != null); - SegmentTermDocs segTermDocs = new SegmentTermDocs(reader); - assertTrue(segTermDocs != null); - segTermDocs.seek(new Term("junk", "bad")); - assertTrue(segTermDocs.next() == false); + TermDocs termDocs = reader.termDocs(); + assertTrue(termDocs != null); + termDocs.seek(new Term("junk", "bad")); + assertTrue(termDocs.next() == false); reader.close(); } } Index: src/test/org/apache/lucene/index/TestIndexWriter.java =================================================================== --- src/test/org/apache/lucene/index/TestIndexWriter.java (revision 718730) +++ src/test/org/apache/lucene/index/TestIndexWriter.java (working copy) @@ -542,7 +542,7 @@ String[] startFiles = dir.list(); SegmentInfos infos = new SegmentInfos(); infos.read(dir); - new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null); + new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null, PostingsCodec.getCodec()); String[] endFiles = dir.list(); Arrays.sort(startFiles); Index: src/test/org/apache/lucene/index/TestMultiLevelSkipList.java =================================================================== --- src/test/org/apache/lucene/index/TestMultiLevelSkipList.java (revision 718730) +++ src/test/org/apache/lucene/index/TestMultiLevelSkipList.java (working copy) @@ -32,7 +32,8 @@ import 
org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.store.MockRAMDirectory; +import org.apache.lucene.store.Directory; /** * This testcase tests whether multi-level skipping is being used @@ -44,7 +45,7 @@ */ public class TestMultiLevelSkipList extends LuceneTestCase { public void testSimpleSkip() throws IOException { - RAMDirectory dir = new RAMDirectory(); + Directory dir = new CountingRAMDirectory(); IndexWriter writer = new IndexWriter(dir, new PayloadAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); Term term = new Term("test", "a"); @@ -58,8 +59,7 @@ writer.close(); IndexReader reader = IndexReader.open(dir); - SegmentTermPositions tp = (SegmentTermPositions) reader.termPositions(); - tp.freqStream = new CountingStream(tp.freqStream); + TermPositions tp = reader.termPositions(); for (int i = 0; i < 2; i++) { counter = 0; @@ -114,6 +114,15 @@ } + class CountingRAMDirectory extends MockRAMDirectory { + public IndexInput openInput(String fileName) throws IOException { + IndexInput in = super.openInput(fileName); + if (fileName.endsWith(".frq")) + in = new CountingStream(in); + return in; + } + } + private int counter = 0; // Simply extends IndexInput in a way that we are able to count the number Index: src/test/org/apache/lucene/index/TestFormatPostings.java =================================================================== --- src/test/org/apache/lucene/index/TestFormatPostings.java (revision 0) +++ src/test/org/apache/lucene/index/TestFormatPostings.java (revision 0) @@ -0,0 +1,432 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
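The TestMultiLevelSkipList change above stops poking at the package-private SegmentTermPositions.freqStream and instead counts reads by decorating the Directory: CountingRAMDirectory wraps whatever openInput returns for .frq files. A sketch of a complete delegating wrapper, assuming the 2.x-era IndexInput abstract methods (the test's existing CountingStream, not shown in full here, counts reads the same way):

import java.io.IOException;
import org.apache.lucene.store.IndexInput;

// Counts every byte read from the wrapped input.
class CountingIndexInput extends IndexInput {
  private final IndexInput in;
  private int counter;

  CountingIndexInput(IndexInput in) { this.in = in; }

  public byte readByte() throws IOException { counter++; return in.readByte(); }
  public void readBytes(byte[] b, int offset, int len) throws IOException {
    counter += len;
    in.readBytes(b, offset, len);
  }
  public void close() throws IOException { in.close(); }
  public long getFilePointer() { return in.getFilePointer(); }
  public void seek(long pos) throws IOException { in.seek(pos); }
  public long length() { return in.length(); }
  public int count() { return counter; }
}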
+ */ + +import org.apache.lucene.util.*; +import org.apache.lucene.store.*; +import java.util.*; + +// TODO +// - make more docs per term, to test > 1 level skipping +// - test all combinations of payloads/not and omitTF/not +// - test w/ different indexDivisor +// - test field where payload length rarely changes +// - 0-term fields +// - seek/skip to same term/doc i'm already on +// - mix in deleted docs +// - seek, skip beyond end -- assert returns false +// - seek, skip to things that don't exist -- ensure it +// goes to 1 before next one known to exist +// - skipTo(term) +// - skipTo(doc) + +public class TestFormatPostings extends LuceneTestCase { + + private static final Random RANDOM = new Random(42); + private static String[] fieldNames = new String[] {"one", "two", "three", "four"}; + + private final static int NUM_TEST_ITER = 4000; + private final static int NUM_TEST_THREADS = 3; // nocommit + private final static int NUM_FIELDS = 4; + private final static int NUM_TERMS_RAND = 50; // must be > 16 to test skipping + private final static int DOC_FREQ_RAND = 500; // must be > 16 to test skipping + private final static int TERM_DOC_FREQ_RAND = 20; + + // start is inclusive and end is exclusive + public int nextInt(int start, int end) { + return start + RANDOM.nextInt(end-start); + } + + private int nextInt(int lim) { + return RANDOM.nextInt(lim); + } + + private boolean nextBoolean() { + return 0 == nextInt(1); + } + + char[] getRandomText() { + + final int len = 1+nextInt(10); + char[] buffer = new char[len+1]; + for(int i=0;i=0;i--) { + if (PostingsCodec.DEBUG) + System.out.println(" TEST: term=" + field.terms[i].text2 + " has docFreq=" + field.terms[i].docs.length); + assertTrue(termsEnum.seek(field.terms[i].text2)); + assertEquals(field.terms[i].docs.length, termsEnum.docFreq()); + } + + // Seek to non-existent empty-string term + assertFalse(termsEnum.seek("")); + + // Make sure we're now pointing to first term + assertEquals(termsEnum.text(), field.terms[0].text2); + + // Test docs enum + if (PostingsCodec.DEBUG) + System.out.println("\nTEST: docs/positions"); + termsEnum.seek(""); + upto = 0; + do { + if (nextInt(3) == 1) { + term = field.terms[upto]; + if (PostingsCodec.DEBUG) + System.out.println("TEST [" + getDesc(field, term) + "]: iterate docs..."); + DocsEnum docs = termsEnum.docs(null); + int upto2 = -1; + while(upto2 < term.docs.length-1) { + // Maybe skip: + final int left = term.docs.length-upto2; + if (nextInt(3) == 1 && left >= 1) { + int inc = 1+nextInt(left-1); + upto2 += inc; + if (PostingsCodec.DEBUG) + System.out.println("TEST [" + getDesc(field, term, term.docs[upto2]) + "]: dr.skip: " + left + " docs left; skip to doc=" + term.docs[upto2] + " [" + upto2 + " of " + term.docs.length + "]"); + assertTrue(docs.skip(term.docs[upto2])); + } else { + assertTrue(docs.next()); + upto2++; + } + if (PostingsCodec.DEBUG) + System.out.println("TEST [" + getDesc(field, term, term.docs[upto2]) + "]: got next doc..."); + assertEquals(term.docs[upto2], docs.doc()); + if (!field.omitTF) { + assertEquals(term.positions[upto2].length, docs.freq()); + if (nextInt(2) == 1) { + if (PostingsCodec.DEBUG) + System.out.println("TEST [" + getDesc(field, term, term.docs[upto2]) + "]: check positions for doc " + term.docs[upto2] + "..."); + verifyPositions(term.positions[upto2], docs.positions()); + } else if (PostingsCodec.DEBUG) + System.out.println("TEST: skip positions..."); + } else if (PostingsCodec.DEBUG) + System.out.println("TEST: skip positions: omitTF=true"); + } + + 
assertFalse(docs.next()); + + } else if (PostingsCodec.DEBUG) + System.out.println("TEST [" + getDesc(field, term) + "]: skip docs"); + upto++; + + } while (termsEnum.next()); + + assertEquals(upto, field.terms.length); + + termsEnum.close(); + } + } + } + + private void write(FieldInfos fieldInfos, Directory dir, RandomField[] fields) throws Throwable { + + // nocommit -- randomize this: + final int termIndexInterval = 16; + + SegmentWriteState state = new SegmentWriteState(null, dir, SEGMENT, fieldInfos, null, 10000, 10000, termIndexInterval, + PostingsCodec.getCodec()); + + final FieldsConsumer consumer = state.codec.fieldsConsumer(state); + Arrays.sort(fields); + for(int i=0;i= 0); } Index: src/test/org/apache/lucene/TestSearchForDuplicates.java =================================================================== --- src/test/org/apache/lucene/TestSearchForDuplicates.java (revision 718730) +++ src/test/org/apache/lucene/TestSearchForDuplicates.java (working copy) @@ -94,6 +94,9 @@ for (int j = 0; j < MAX_DOCS; j++) { Document d = new Document(); d.add(new Field(PRIORITY_FIELD, HIGH_PRIORITY, Field.Store.YES, Field.Index.ANALYZED)); + + // NOTE: this ID_FIELD produces no tokens since + // SimpleAnalyzer discards numbers d.add(new Field(ID_FIELD, Integer.toString(j), Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(d); } Index: src/java/org/apache/lucene/index/FormatPostingsDocsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsDocsConsumer.java (revision 718730) +++ src/java/org/apache/lucene/index/FormatPostingsDocsConsumer.java (working copy) @@ -1,34 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -/** - * NOTE: this API is experimental and will likely change - */ - -abstract class FormatPostingsDocsConsumer { - - /** Adds a new doc in this term. If this returns null - * then we just skip consuming positions/payloads. */ - abstract FormatPostingsPositionsConsumer addDoc(int docID, int termDocFreq) throws IOException; - - /** Called when we are done adding docs to this term */ - abstract void finish() throws IOException; -} Index: src/java/org/apache/lucene/index/DocsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/DocsConsumer.java (revision 0) +++ src/java/org/apache/lucene/index/DocsConsumer.java (revision 0) @@ -0,0 +1,48 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.store.IndexOutput; + +/** + * NOTE: this API is experimental and will likely change + */ + +abstract class DocsConsumer { + + // nocommit + String desc; + + abstract void start(IndexOutput termsOut) throws IOException; + + abstract void startTerm() throws IOException; + + /** Adds a new doc in this term. Return null if this + * consumer doesn't need to see the positions for this + * doc. */ + abstract PositionsConsumer addDoc(int docID, int termDocFreq) throws IOException; + + /** Finishes the current term */ + abstract void finishTerm(int numDocs, boolean isIndexTerm) throws IOException; + + abstract void setField(FieldInfo fieldInfo); + + abstract void close() throws IOException; +} Property changes on: src/java/org/apache/lucene/index/DocsConsumer.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/SepSkipListWriter.java =================================================================== --- src/java/org/apache/lucene/index/SepSkipListWriter.java (revision 0) +++ src/java/org/apache/lucene/index/SepSkipListWriter.java (revision 0) @@ -0,0 +1,173 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.store.IndexOutput; + + +/** + * Implements the skip list writer for the default posting list format + * that stores positions and payloads. 
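DocsConsumer above is the write-side half of the new codec API, with an explicit lifecycle: start(termsOut) binds it to the terms dict once, setField() is called per field, each term is bracketed by startTerm()/finishTerm(), and addDoc() in between returns a PositionsConsumer (or null when positions are not needed, e.g. omitTf). An illustrative driver loop, offered as a sketch only; the real caller is the terms-dict writer, which is not part of this excerpt, and the docIDs/freqs arrays stand in for whatever structure it actually iterates:

void writeTerm(DocsConsumer docs, int[] docIDs, int[] freqs) throws IOException {
  docs.startTerm();
  for (int i = 0; i < docIDs.length; i++) {
    PositionsConsumer pos = docs.addDoc(docIDs[i], freqs[i]);
    if (pos != null) {
      // feed freqs[i] positions (and payloads) through pos here;
      // PositionsConsumer's methods are not shown in this hunk
    }
  }
  docs.finishTerm(docIDs.length, /*isIndexTerm=*/false);
}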
+ * + */ +class SepSkipListWriter extends MultiLevelSkipListWriter { + private int[] lastSkipDoc; + private int[] lastSkipPayloadLength; + private long[] lastSkipDocPointer; + private long[] lastSkipFreqPointer; + private long[] lastSkipPosPointer; + private long[] lastSkipPayloadPointer; + + private IndexOutput freqOutput; + private IndexOutput docOutput; + // nocommit -- private again + IndexOutput posOutput; + // nocommit -- private again + IndexOutput payloadOutput; + + private int curDoc; + private boolean curStorePayloads; + private int curPayloadLength; + private long curFreqPointer; + private long curDocPointer; + private long curPosPointer; + private long curPayloadPointer; + + SepSkipListWriter(int skipInterval, int numberOfSkipLevels, int docCount, + IndexOutput freqOutput, + IndexOutput docOutput, + IndexOutput posOutput, + IndexOutput payloadOutput) { + super(skipInterval, numberOfSkipLevels, docCount); + + this.freqOutput = freqOutput; + this.docOutput = docOutput; + this.posOutput = posOutput; + this.payloadOutput = payloadOutput; + + lastSkipDoc = new int[numberOfSkipLevels]; + lastSkipPayloadLength = new int[numberOfSkipLevels]; + lastSkipFreqPointer = new long[numberOfSkipLevels]; + lastSkipDocPointer = new long[numberOfSkipLevels]; + lastSkipPosPointer = new long[numberOfSkipLevels]; + lastSkipPayloadPointer = new long[numberOfSkipLevels]; + } + + void setFreqOutput(IndexOutput freqOutput) { + this.freqOutput = freqOutput; + } + + void setDocOutput(IndexOutput docOutput) { + this.docOutput = docOutput; + } + + void setPosOutput(IndexOutput posOutput) { + this.posOutput = posOutput; + } + + void setPayloadOutput(IndexOutput payloadOutput) { + this.payloadOutput = payloadOutput; + } + + /** + * Sets the values for the current skip data. + */ + void setSkipData(int doc, boolean storePayloads, int payloadLength) { + this.curDoc = doc; + this.curStorePayloads = storePayloads; + this.curPayloadLength = payloadLength; + this.curFreqPointer = freqOutput.getFilePointer(); + this.curDocPointer = docOutput.getFilePointer(); + if (posOutput != null) + this.curPosPointer = posOutput.getFilePointer(); + if (payloadOutput != null) + this.curPayloadPointer = payloadOutput.getFilePointer(); + } + + protected void resetSkip() { + super.resetSkip(); + Arrays.fill(lastSkipDoc, 0); + Arrays.fill(lastSkipPayloadLength, -1); // we don't have to write the first length in the skip list + Arrays.fill(lastSkipFreqPointer, freqOutput.getFilePointer()); + Arrays.fill(lastSkipDocPointer, docOutput.getFilePointer()); + if (posOutput != null) + Arrays.fill(lastSkipPosPointer, posOutput.getFilePointer()); + if (payloadOutput != null) + Arrays.fill(lastSkipPayloadPointer, payloadOutput.getFilePointer()); + + if (PostingsCodec.DEBUG) + System.out.println(" skip writer base freqFP=" + freqOutput.getFilePointer() + " posFP=" + posOutput.getFilePointer()); + } + + protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException { + // To efficiently store payloads in the posting lists we do not store the length of + // every payload. Instead we omit the length for a payload if the previous payload had + // the same length. + // However, in order to support skipping the payload length at every skip point must be known. 
+ // So we use the same length encoding that we use for the posting lists for the skip data as well: + // Case 1: current field does not store payloads + // SkipDatum --> DocSkip, FreqSkip, ProxSkip + // DocSkip,FreqSkip,ProxSkip --> VInt + // DocSkip records the document number before every SkipInterval th document in TermFreqs. + // Document numbers are represented as differences from the previous value in the sequence. + // Case 2: current field stores payloads + // SkipDatum --> DocSkip, PayloadLength?, FreqSkip,ProxSkip + // DocSkip,FreqSkip,ProxSkip --> VInt + // PayloadLength --> VInt + // In this case DocSkip/2 is the difference between + // the current and the previous value. If DocSkip + // is odd, then a PayloadLength encoded as VInt follows, + // if DocSkip is even, then it is assumed that the + // current payload length equals the length at the previous + // skip point + //System.out.println(" skip writer level=" + level + " curDoc=" + curDoc + " lastDoc=" + lastSkipDoc[level] + " delta=" + (curDoc - lastSkipDoc[level]) + " storePayloads=" + curStorePayloads + " skipBufferFP=" + skipBuffer.getFilePointer()); + if (curStorePayloads) { + int delta = curDoc - lastSkipDoc[level]; + if (curPayloadLength == lastSkipPayloadLength[level]) { + // the current payload length equals the length at the previous skip point, + // so we don't store the length again + skipBuffer.writeVInt(delta << 1); + } else { + // the payload length is different from the previous one. We shift the DocSkip, + // set the lowest bit and store the current payload length as VInt. + skipBuffer.writeVInt(delta << 1 | 1); + skipBuffer.writeVInt(curPayloadLength); + lastSkipPayloadLength[level] = curPayloadLength; + } + } else { + // current field does not store payloads + skipBuffer.writeVInt(curDoc - lastSkipDoc[level]); + } + + // nocommit -- if payloads / pos not stored for this + // field, don't encode these 0's + skipBuffer.writeVInt((int) (curFreqPointer - lastSkipFreqPointer[level])); + skipBuffer.writeVInt((int) (curDocPointer - lastSkipDocPointer[level])); + skipBuffer.writeVInt((int) (curPosPointer - lastSkipPosPointer[level])); + skipBuffer.writeVInt((int) (curPayloadPointer - lastSkipPayloadPointer[level])); + + lastSkipDoc[level] = curDoc; + lastSkipFreqPointer[level] = curFreqPointer; + lastSkipDocPointer[level] = curDocPointer; + lastSkipPosPointer[level] = curPosPointer; + lastSkipPayloadPointer[level] = curPayloadPointer; + } +} Property changes on: src/java/org/apache/lucene/index/SepSkipListWriter.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/FormatPostingsDocsReader.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsDocsReader.java (revision 0) +++ src/java/org/apache/lucene/index/FormatPostingsDocsReader.java (revision 0) @@ -0,0 +1,420 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
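The writeSkipData comment block above describes the payload-aware DocSkip encoding; isolated into an encode/decode pair it looks like this (method names invented for the sketch; the real read side is DefaultSkipListReader):

import java.io.IOException;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;

class DocSkipCodec {
  // Even code: payload length unchanged. Odd code: new length follows.
  static void write(IndexOutput out, int docDelta,
                    int payloadLength, int lastPayloadLength) throws IOException {
    if (payloadLength == lastPayloadLength) {
      out.writeVInt(docDelta << 1);
    } else {
      out.writeVInt(docDelta << 1 | 1);
      out.writeVInt(payloadLength);
    }
  }

  // Returns the doc delta; payloadLength[0] is updated when the low bit is set.
  static int read(IndexInput in, int[] payloadLength) throws IOException {
    final int code = in.readVInt();
    if ((code & 1) != 0)
      payloadLength[0] = in.readVInt();
    return code >>> 1;
  }
}

So with payload lengths 3, 3, 5 at three successive skip points, only the first and third skip entries pay for an explicit PayloadLength VInt.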
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Collection; + +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BitVector; + +/** Concrete class that reads the current doc/freq/skip + * postings format */ + +// nocommit -- should we switch "hasProx" higher up? and +// create two separate docs readers, one that also reads +// prox and one that doesn't? + +class FormatPostingsDocsReader extends FormatPostingsTermsDictDocsReader { + + final IndexInput freqIn; + IndexInput termsIn; + + private final FormatPostingsPositionsReader posReader; + + int skipInterval; + int maxSkipLevels; + + FormatPostingsDocsReader(Directory dir, SegmentInfo segmentInfo, int readBufferSize) throws IOException { + freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, IndexFileNames.FREQ_EXTENSION), readBufferSize); + + boolean success = false; + try { + if (segmentInfo.getHasProx()) + posReader = new FormatPostingsPositionsReader(dir, segmentInfo, readBufferSize); + else + posReader = null; + success = true; + } finally { + if (!success) + freqIn.close(); + } + } + + static void files(SegmentInfo segmentInfo, Collection files) { + files.add(IndexFileNames.segmentFileName(segmentInfo.name, IndexFileNames.FREQ_EXTENSION)); + FormatPostingsPositionsReader.files(segmentInfo, files); + } + + void start(IndexInput termsIn) throws IOException { + this.termsIn = termsIn; + + // Make sure we are talking to the matching past writer + PostingsCodec.checkHeader(termsIn, FormatPostingsDocsWriter.CODEC, FormatPostingsDocsWriter.VERSION_START); + + skipInterval = termsIn.readInt(); + maxSkipLevels = termsIn.readInt(); + if (posReader != null) + posReader.start(termsIn); + } + + Reader reader(FieldInfo fieldInfo, IndexInput termsIn) { + + final FormatPostingsPositionsReader.TermsDictReader posReader2; + if (posReader != null && !fieldInfo.omitTf) + posReader2 = (FormatPostingsPositionsReader.TermsDictReader) posReader.reader(fieldInfo, termsIn); + else + posReader2 = null; + + return new TermsDictReader(fieldInfo, posReader2, termsIn); + } + + void close() throws IOException { + try { + freqIn.close(); + } finally { + if (posReader != null) + posReader.close(); + } + } + + class TermsDictReader extends Reader { + + final IndexInput termsIn; + final FieldInfo fieldInfo; + long freqOffset; + long skipOffset; + int docFreq; + + // TODO: abstraction violation (we are storing this with + // the concrete impl. 
as the type, not the abstract base + // class) + final FormatPostingsPositionsReader.TermsDictReader posReader; + private SegmentDocsEnum docs; + + TermsDictReader(FieldInfo fieldInfo, FormatPostingsPositionsReader.TermsDictReader posReader, IndexInput termsIn) { + this.termsIn = termsIn; // not cloned + this.fieldInfo = fieldInfo; + this.posReader = posReader; + } + + void readTerm(int docFreq, boolean isIndexTerm) throws IOException { + + this.docFreq = docFreq; + if (PostingsCodec.DEBUG) + System.out.println(" dr.readTerm termsInPointer=" + termsIn.getFilePointer() + " df=" + docFreq + " isIndex=" + isIndexTerm); + + if (isIndexTerm) + freqOffset = termsIn.readVLong(); + else + freqOffset += termsIn.readVLong(); + + if (PostingsCodec.DEBUG) + System.out.println(" freqOffset=" + freqOffset + " vs len=" + freqIn.length()); + + if (docFreq >= skipInterval) + skipOffset = termsIn.readVLong(); + else + skipOffset = 0; + + if (posReader != null) + posReader.readTerm(docFreq, isIndexTerm); + } + + public void close() throws IOException { + if (posReader != null) + posReader.close(); + } + + DocsEnum docs(BitVector deletedDocs) throws IOException { + + if (docs == null) + // Lazy init + docs = new SegmentDocsEnum(); + + docs.init(deletedDocs); + + return docs; + } + + class SegmentDocsEnum extends DocsEnum { + int docFreq; + int doc; + int count; + int freq; + long skipStart; + long freqStart; + final IndexInput freqIn; + // nocommit -- should we do omitTF with 2 different enum classes? + final boolean omitTF; + private BitVector deletedDocs; + + // nocommit -- should we do hasProx with 2 different enum classes? + + boolean skipped; + DefaultSkipListReader skipper; + + // TODO: abstraction violation: we are storing the + // concrete impl, not the abstract base class + FormatPostingsPositionsReader.TermsDictReader.SegmentPositionsEnum positions; + + SegmentDocsEnum() { + if (PostingsCodec.DEBUG) + System.out.println("new docs enum"); + this.freqIn = (IndexInput) FormatPostingsDocsReader.this.freqIn.clone(); + omitTF = fieldInfo.omitTf; + if (omitTF) + freq = 1; + } + + void close() { + } + + void init(BitVector deletedDocs) throws IOException { + if (PostingsCodec.DEBUG) + System.out.println("[" + desc + "] dr.init freqIn seek " + freqOffset + " this=" + this + " (in=" + freqIn + "; this=" + this + ")"); + this.deletedDocs = deletedDocs; + freqIn.seek(freqOffset); + this.docFreq = TermsDictReader.this.docFreq; + count = 0; + doc = 0; + skipped = false; + skipStart = freqStart + skipOffset; + proxSkipFreq = 0; + + // maybe not necessary? 
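readTerm above is the reader half of the terms-dict pointer convention: at an index term the freq-file offset arrives absolute, otherwise as a delta against the previous term, so a consumer that seeks to any index term can decode forward without replaying the whole field. The two halves side by side (the writer half is FormatPostingsDocsWriter.finishTerm, later in this patch):

// Writer (finishTerm):
if (isIndexTerm)
  termsOut.writeVLong(freqStart);                 // absolute at seek points
else
  termsOut.writeVLong(freqStart - lastFreqStart); // delta in between
lastFreqStart = freqStart;

// Reader (readTerm above):
if (isIndexTerm)
  freqOffset = termsIn.readVLong();
else
  freqOffset += termsIn.readVLong();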
+ proxSkipPayloadLength = -1; + + // TODO: abstraction violation + if (posReader != null) + proxOffset = posReader.proxOffset; + } + + boolean next() throws IOException { + if (PostingsCodec.DEBUG) + System.out.println("dr [" + desc + "] next count=" + count + " vs df=" + docFreq + " freq pointer=" + freqIn.getFilePointer() + " (in=" + freqIn + "; this=" + this + ") + has del docs=" + (deletedDocs != null) ); + + // new Throwable().printStackTrace(System.out); + + while(true) { + if (count == docFreq) + return false; + + count++; + + // Decode next doc/freq pair + final int code = freqIn.readVInt(); + if (PostingsCodec.DEBUG) + System.out.println(" read code=" + code); + if (omitTF) + doc += code; + else { + doc += code >>> 1; // shift off low bit + if ((code & 1) != 0) // if low bit is set + freq = 1; // freq is one + else + freq = freqIn.readVInt(); // else read freq + + if (positions != null) + positions.skip(freq); + else + proxSkipFreq += freq; + } + + if (deletedDocs == null || !deletedDocs.get(doc)) + break; + else if (PostingsCodec.DEBUG) + System.out.println(" doc=" + doc + " is deleted"); + } + + // nocommit + if (PostingsCodec.DEBUG && positions != null) + positions.desc = desc + ":" + doc; + + return true; + } + + int read(int[] docs, int[] freqs) throws IOException { + int i = 0; + final int length = docs.length; + while (i < length && count < docFreq) { + count++; + // manually inlined call to next() for speed + final int code = freqIn.readVInt(); + if (omitTF) { + doc += code; + freq = 1; + } else { + doc += code >>> 1; // shift off low bit + if ((code & 1) != 0) // if low bit is set + freq = 1; // freq is one + else + freq = freqIn.readVInt(); // else read freq + + if (positions != null) + positions.skip(freq); + else + proxSkipFreq += freq; + } + + if (deletedDocs == null || !deletedDocs.get(doc)) { + docs[i] = doc; + freqs[i] = freq; + ++i; + } + } + + return i; + } + + int doc() { + return doc; + } + + int ord() { + assert count > 0; + return count-1; + } + + int freq() { + return freq; + } + + long proxOffset; + int proxSkipPayloadLength = -1; + int proxSkipFreq; + PositionsEnum fakePositions; + + PositionsEnum positions() throws IOException { + if (positions == null) { + // Lazy init + if (posReader == null) { + // TermFreq was omitted from this field during + // indexing, which means we pretend termFreq is + // always 1 with that 1 occurrence having + // position 0 + if (fakePositions == null) + fakePositions = new FormatPostingsFakePositionsEnum(); + return fakePositions; + } else { + // TODO: abstraction violation + positions = (FormatPostingsPositionsReader.TermsDictReader.SegmentPositionsEnum) posReader.positions(); + if (PostingsCodec.DEBUG) + System.out.println("pos skip proxOffset=" + proxOffset + " payloadlen=" + proxSkipPayloadLength + " skipPosCount= " + proxSkipFreq); + positions.skip(proxOffset, proxSkipPayloadLength, proxSkipFreq); + } + } + + if (PostingsCodec.DEBUG) + positions.desc = desc + ":" + doc; + + positions.catchUp(freq); + + return positions; + } + + boolean skip(int target) throws IOException { + + // TODO: jump right to next() if target is < X away + // from where we are now? 
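For concreteness, the doc/freq decode in next() and read() above inverts the following write-side encoding; a tiny worked example with assumed values, showing what lands in the freq file for docs 5, 8, 12 with freqs 1, 3, 1:

out.writeVInt(5 << 1 | 1); // doc delta 5, low bit set: freq == 1
out.writeVInt(3 << 1);     // doc delta 3, low bit clear ...
out.writeVInt(3);          //   ... so the freq (3) follows as its own VInt
out.writeVInt(4 << 1 | 1); // doc delta 4, freq == 1 again

With omitTF the low-bit trick is dropped entirely and each entry is just the doc delta.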
+ + if (PostingsCodec.DEBUG) + System.out.println("dr [" + desc + "]: skip to target=" + target); + + if (skipOffset > 0) { + + // There are enough docs in the posting to have + // skip data + if (skipper == null) + // Lazy init + skipper = new DefaultSkipListReader((IndexInput) freqIn.clone(), maxSkipLevels, skipInterval); + + if (!skipped) { + + // We haven't already skipped for this posting, + // so now we init the skipper + + // TODO: this is abstraction violation; instead, + // skipper should interact with this as a + // private consumer + skipper.init(freqOffset+skipStart, + freqOffset, proxOffset, + docFreq, fieldInfo.storePayloads); + + if (PostingsCodec.DEBUG) + System.out.println(" skip reader base freqFP=" + (freqOffset+skipStart) + " freqFP=" + freqOffset + " proxFP=" + proxOffset); + + skipped = true; + } + + final int newCount = skipper.skipTo(target); + + if (newCount > count) { + + if (PostingsCodec.DEBUG) + System.out.println("dr [" + desc + "]: skipper moved to newCount=" + newCount + " freqFP=" + skipper.getFreqPointer() + " proxFP=" + skipper.getProxPointer() + " doc=" + skipper.getDoc()); + + // Skipper did move + freqIn.seek(skipper.getFreqPointer()); + count = newCount; + doc = skipper.getDoc(); + + // TODO: abstraction violation; this should be a + // private interaction b/w skipper & posReader + if (positions != null) + // nocommit -- should that be count? + positions.skip(skipper.getProxPointer(), skipper.getPayloadLength(), 0); + else { + proxOffset = skipper.getProxPointer(); + proxSkipPayloadLength = skipper.getPayloadLength(); + // nocommit -- should that be count? + proxSkipFreq = 0; + } + } else if (PostingsCodec.DEBUG) + System.out.println(" no skipping to be done"); + } + + // Now, linear scan for the rest: + do { + if (!next()) + return false; + } while (target > doc); + + return true; + } + } + } +} + +/** Returned when someone asks for positions() enum on field + * with omitTf true */ +class FormatPostingsFakePositionsEnum extends PositionsEnum { + int next() { + return 0; + } + int getPayloadLength() { + return 0; + } + boolean hasPayload() { + return false; + } + byte[] getPayload(byte[] data, int offset) { + return null; + } +} Property changes on: src/java/org/apache/lucene/index/FormatPostingsDocsReader.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/DirectoryIndexReader.java =================================================================== --- src/java/org/apache/lucene/index/DirectoryIndexReader.java (revision 718730) +++ src/java/org/apache/lucene/index/DirectoryIndexReader.java (working copy) @@ -255,6 +255,8 @@ if(closeDirectory) directory.close(); } + + final PostingsCodec codec = PostingsCodec.getCodec(); /** * Commit changes resulting from delete, undeleteAll, or @@ -273,7 +275,7 @@ // KeepOnlyLastCommitDeleter: IndexFileDeleter deleter = new IndexFileDeleter(directory, deletionPolicy == null ? 
new KeepOnlyLastCommitDeletionPolicy() : deletionPolicy, - segmentInfos, null, null); + segmentInfos, null, null, codec); // Checkpoint the state we are about to change, in // case we have to roll back: Index: src/java/org/apache/lucene/index/FormatPostingsDocsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsDocsWriter.java (revision 718730) +++ src/java/org/apache/lucene/index/FormatPostingsDocsWriter.java (working copy) @@ -25,36 +25,68 @@ import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.store.IndexOutput; -final class FormatPostingsDocsWriter extends FormatPostingsDocsConsumer { +final class FormatPostingsDocsWriter extends DocsConsumer { + final static String CODEC = "SingleFileDocFreqSkip"; + // Increment version to change it: + final static int VERSION_START = 0; + final static int VERSION_CURRENT = VERSION_START; + final IndexOutput out; - final FormatPostingsTermsWriter parent; final FormatPostingsPositionsWriter posWriter; final DefaultSkipListWriter skipListWriter; final int skipInterval; + final int maxSkipLevels; final int totalNumDocs; + IndexOutput termsOut; boolean omitTF; boolean storePayloads; + // Starts a new term + long lastFreqStart; long freqStart; FieldInfo fieldInfo; - FormatPostingsDocsWriter(SegmentWriteState state, FormatPostingsTermsWriter parent) throws IOException { + FormatPostingsDocsWriter(SegmentWriteState state) throws IOException { super(); - this.parent = parent; - final String fileName = IndexFileNames.segmentFileName(parent.parent.segment, IndexFileNames.FREQ_EXTENSION); + final String fileName = IndexFileNames.segmentFileName(state.segmentName, IndexFileNames.FREQ_EXTENSION); state.flushedFiles.add(fileName); - out = parent.parent.dir.createOutput(fileName); - totalNumDocs = parent.parent.totalNumDocs; + out = state.directory.createOutput(fileName); + totalNumDocs = state.numDocs; - // TODO: abstraction violation - skipInterval = parent.parent.termsOut.skipInterval; - skipListWriter = parent.parent.skipListWriter; - skipListWriter.setFreqOutput(out); + // nocommit -- abstraction violation + skipListWriter = new DefaultSkipListWriter(state.skipInterval, + state.maxSkipLevels, + state.numDocs, + out, + null); + skipInterval = state.skipInterval; + maxSkipLevels = state.maxSkipLevels; + posWriter = new FormatPostingsPositionsWriter(state, this); } + void start(IndexOutput termsOut) throws IOException { + this.termsOut = termsOut; + PostingsCodec.writeHeader(termsOut, CODEC, VERSION_CURRENT); + termsOut.writeInt(skipInterval); // write skipInterval + termsOut.writeInt(maxSkipLevels); // write maxSkipLevels + posWriter.start(termsOut); + } + + void startTerm() { + freqStart = out.getFilePointer(); + if (!omitTF) + posWriter.startTerm(); + skipListWriter.resetSkip(); + } + + // nocommit -- should we NOT reuse across fields? would + // be cleaner + + // Currently, this instance is re-used across fields, so + // our parent calls setField whenever the field changes void setField(FieldInfo fieldInfo) { this.fieldInfo = fieldInfo; omitTF = fieldInfo.omitTf; @@ -65,11 +97,15 @@ int lastDocID; int df; + int count; + /** Adds a new doc in this term. If this returns null * then we just skip consuming positions/payloads. 
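start(termsOut) above stamps the terms file with CODEC and VERSION_CURRENT and then records skipInterval and maxSkipLevels, which FormatPostingsDocsReader.start() reads back through PostingsCodec.checkHeader. The patch excerpt does not include PostingsCodec itself; a plausible shape for the header helpers, offered purely as an assumption:

import java.io.IOException;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;

// Hypothetical: the real PostingsCodec.writeHeader/checkHeader are not
// part of this excerpt.
class HeaderSketch {
  static void writeHeader(IndexOutput out, String codec, int version) throws IOException {
    out.writeString(codec);
    out.writeInt(version);
  }

  static int checkHeader(IndexInput in, String codec, int minVersion) throws IOException {
    final String actual = in.readString();
    if (!codec.equals(actual))
      throw new IOException("codec mismatch: expected " + codec + " but found " + actual);
    final int version = in.readInt();
    if (version < minVersion)
      throw new IOException("format version " + version + " is older than minimum " + minVersion);
    return version;
  }
}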
*/ - FormatPostingsPositionsConsumer addDoc(int docID, int termDocFreq) throws IOException { + PositionsConsumer addDoc(int docID, int termDocFreq) throws IOException { final int delta = docID - lastDocID; + if (PostingsCodec.DEBUG) + System.out.println(" dw.addDoc [" + desc + "] count=" + (count++) + " docID=" + docID + " lastDocID=" + lastDocID + " delta=" + delta + " omitTF=" + omitTF + " freq=" + termDocFreq + " freqPointer=" + out.getFilePointer()); if (docID < 0 || (df > 0 && delta <= 0)) throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )"); @@ -78,8 +114,12 @@ // TODO: abstraction violation skipListWriter.setSkipData(lastDocID, storePayloads, posWriter.lastPayloadLength); skipListWriter.bufferSkip(df); + if (PostingsCodec.DEBUG) + System.out.println(" bufferSkip lastDocID=" + lastDocID + " df=" + df + " freqFP=" + out.getFilePointer() + " proxFP=" + skipListWriter.proxOutput.getFilePointer()); } + // nocommit -- move this assert up above; every consumer + // shouldn't have to check for this bug: assert docID < totalNumDocs: "docID=" + docID + " totalNumDocs=" + totalNumDocs; lastDocID = docID; @@ -92,36 +132,56 @@ out.writeVInt(termDocFreq); } - return posWriter; + // nocommit + if (PostingsCodec.DEBUG) + ((FormatPostingsPositionsWriter) posWriter).desc = desc + ":" + docID; + + if (omitTF) + return null; + else + return posWriter; } - private final TermInfo termInfo = new TermInfo(); // minimize consing - final UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result(); - /** Called when we are done adding docs to this term */ - void finish() throws IOException { - long skipPointer = skipListWriter.writeSkip(out); + void finishTerm(int docCount, boolean isIndexTerm) throws IOException { - // TODO: this is abstraction violation -- we should not - // peek up into parents terms encoding format - termInfo.set(df, parent.freqStart, parent.proxStart, (int) (skipPointer - parent.freqStart)); + // nocommit -- wasteful we are counting this in two places? 
+ assert docCount == df; + if (PostingsCodec.DEBUG) + System.out.println("dw.finishTerm termsOut pointer=" + termsOut.getFilePointer() + " freqStart=" + freqStart + " df=" + df); - // TODO: we could do this incrementally - UnicodeUtil.UTF16toUTF8(parent.currentTerm, parent.currentTermStart, utf8); + if (isIndexTerm) + // Write absolute at seek points + termsOut.writeVLong(freqStart); + else + // Write delta between seek points + termsOut.writeVLong(freqStart - lastFreqStart); - if (df > 0) { - parent.termsOut.add(fieldInfo.number, - utf8.result, - utf8.length, - termInfo); + lastFreqStart = freqStart; + + if (df >= skipInterval) { + if (PostingsCodec.DEBUG) + System.out.println(" writeSkip @ freqFP=" + out.getFilePointer() + " freqStartFP=" + freqStart); + termsOut.writeVLong(skipListWriter.writeSkip(out)-freqStart); } + if (!omitTF) + posWriter.finishTerm(isIndexTerm); + lastDocID = 0; df = 0; + + // nocommit + count = 0; } void close() throws IOException { - out.close(); - posWriter.close(); + if (PostingsCodec.DEBUG) + System.out.println("docs writer close pointer=" + out.getFilePointer()); + try { + out.close(); + } finally { + posWriter.close(); + } } } Index: src/java/org/apache/lucene/index/IndexFileNames.java =================================================================== --- src/java/org/apache/lucene/index/IndexFileNames.java (revision 718730) +++ src/java/org/apache/lucene/index/IndexFileNames.java (working copy) @@ -109,6 +109,10 @@ GEN_EXTENSION, NORMS_EXTENSION, COMPOUND_FILE_STORE_EXTENSION, + // nocommit -- need cleaner way! + "doc", + "pyl", + "skp" }; /** File extensions that are added to a compound file @@ -154,6 +158,12 @@ TERMS_INDEX_EXTENSION, TERMS_EXTENSION }; + + static final String COMPOUND_EXTENSIONS_NOT_CODEC[] = new String[] { + FIELD_INFOS_EXTENSION, + FIELDS_INDEX_EXTENSION, + FIELDS_EXTENSION, + }; /** File extensions for term vector support */ static final String VECTOR_EXTENSIONS[] = new String[] { Index: src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java (revision 718730) +++ src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java (working copy) @@ -1,73 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
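finishTerm above also fixes where skip data lives: it is flushed into the same freq file right after the term's doc/freq entries, and the terms dict records only its offset relative to freqStart, and only when df >= skipInterval. As a per-term layout note (a sketch; names as in the patch):

// One term's block in the freq file:
//   freqStart: df doc/freq VInt entries
//   skipFP:    multi-level skip data (present only if df >= skipInterval)
//
// writeSkip flushes the buffered skip levels and returns skipFP:
long skipFP = skipListWriter.writeSkip(out);
termsOut.writeVLong(skipFP - freqStart); // the reader's skipOffset

The reader mirror is in FormatPostingsDocsReader.readTerm: skipOffset is read only when docFreq >= skipInterval, matching this write condition exactly.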
- */ - -import java.io.IOException; - -import org.apache.lucene.store.Directory; - -final class FormatPostingsFieldsWriter extends FormatPostingsFieldsConsumer { - - final Directory dir; - final String segment; - final TermInfosWriter termsOut; - final FieldInfos fieldInfos; - final FormatPostingsTermsWriter termsWriter; - final DefaultSkipListWriter skipListWriter; - final int totalNumDocs; - - public FormatPostingsFieldsWriter(SegmentWriteState state, FieldInfos fieldInfos) throws IOException { - super(); - - dir = state.directory; - segment = state.segmentName; - totalNumDocs = state.numDocs; - this.fieldInfos = fieldInfos; - termsOut = new TermInfosWriter(dir, - segment, - fieldInfos, - state.termIndexInterval); - - // TODO: this is a nasty abstraction violation (that we - // peek down to find freqOut/proxOut) -- we need a - // better abstraction here whereby these child consumers - // can provide skip data or not - skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval, - termsOut.maxSkipLevels, - totalNumDocs, - null, - null); - - state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_EXTENSION)); - state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION)); - - termsWriter = new FormatPostingsTermsWriter(state, this); - } - - /** Add a new field */ - FormatPostingsTermsConsumer addField(FieldInfo field) { - termsWriter.setField(field); - return termsWriter; - } - - /** Called when we are done adding everything. */ - void finish() throws IOException { - termsOut.close(); - termsWriter.close(); - } -} Index: src/java/org/apache/lucene/index/TermsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/TermsConsumer.java (revision 0) +++ src/java/org/apache/lucene/index/TermsConsumer.java (revision 0) @@ -0,0 +1,37 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +/** + * NOTE: this API is experimental and will likely change + */ + +abstract class TermsConsumer { + + /** Starts a new term in this field; term ends with U+FFFF + * char */ + abstract DocsConsumer startTerm(char[] text, int start) throws IOException; + + /** Finishes the current term */ + abstract void finishTerm(char[] text, int start, int numDocs) throws IOException; + + /** Called when we are done adding terms to this field */ + abstract void finish() throws IOException; +} Property changes on: src/java/org/apache/lucene/index/TermsConsumer.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/FormatPostingsTermsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsTermsConsumer.java (revision 718730) +++ src/java/org/apache/lucene/index/FormatPostingsTermsConsumer.java (working copy) @@ -1,46 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -import org.apache.lucene.util.ArrayUtil; - -/** - * NOTE: this API is experimental and will likely change - */ - -abstract class FormatPostingsTermsConsumer { - - /** Adds a new term in this field; term ends with U+FFFF - * char */ - abstract FormatPostingsDocsConsumer addTerm(char[] text, int start) throws IOException; - - char[] termBuffer; - FormatPostingsDocsConsumer addTerm(String text) throws IOException { - final int len = text.length(); - if (termBuffer == null || termBuffer.length < 1+len) - termBuffer = new char[ArrayUtil.getNextSize(1+len)]; - text.getChars(0, len, termBuffer, 0); - termBuffer[len] = 0xffff; - return addTerm(termBuffer, 0); - } - - /** Called when we are done adding terms to this field */ - abstract void finish() throws IOException; -} Index: src/java/org/apache/lucene/index/FormatPostingsPositionsReader.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsPositionsReader.java (revision 0) +++ src/java/org/apache/lucene/index/FormatPostingsPositionsReader.java (revision 0) @@ -0,0 +1,234 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
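Both the deleted FormatPostingsTermsConsumer and its TermsConsumer replacement above pass terms as a char[] with a U+FFFF terminator instead of a String, so one reusable buffer serves every term (the deleted addTerm(String) shows the producer side: copy the chars, then termBuffer[len] = 0xffff). The consumer recovers the length by scanning for the sentinel, e.g.:

// Length of a U+FFFF-terminated term, as passed to startTerm(text, start).
static int termLength(char[] text, int start) {
  int upto = start;
  while (text[upto] != 0xffff)
    upto++;
  return upto - start;
}

U+FFFF works as a sentinel because it is a Unicode non-character, so no indexed term is expected to contain it.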
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Collection; + +import org.apache.lucene.util.BitVector; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.Directory; + +// nocommit -- base class should not be named terms dict: +// this class interacts w/ a docsreader +class FormatPostingsPositionsReader extends FormatPostingsTermsDictPositionsReader { + + final IndexInput proxIn; + IndexInput termsIn; + + FormatPostingsPositionsReader(Directory dir, SegmentInfo segmentInfo, int readBufferSize) throws IOException { + assert segmentInfo.getHasProx(); + proxIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, IndexFileNames.PROX_EXTENSION), readBufferSize); + } + + void start(IndexInput termsIn) throws IOException { + this.termsIn = termsIn; + + PostingsCodec.checkHeader(termsIn, FormatPostingsPositionsWriter.CODEC, FormatPostingsPositionsWriter.VERSION_START); + } + + static void files(SegmentInfo segmentInfo, Collection files) { + if (segmentInfo.getHasProx()) + files.add(IndexFileNames.segmentFileName(segmentInfo.name, IndexFileNames.PROX_EXTENSION)); + } + + Reader reader(FieldInfo fieldInfo, IndexInput termsIn) { + return new TermsDictReader(termsIn, fieldInfo); + } + + void close() throws IOException { + if (proxIn != null) + proxIn.close(); + } + + class TermsDictReader extends Reader { + + final IndexInput termsIn; + final FieldInfo fieldInfo; + long proxOffset; + + TermsDictReader(IndexInput termsIn, FieldInfo fieldInfo) { + this.termsIn = termsIn; + this.fieldInfo = fieldInfo; + } + + void readTerm(int docFreq, boolean isIndexTerm) throws IOException { + if (PostingsCodec.DEBUG) + System.out.println(" pr.readterm termsInPointer=" + termsIn.getFilePointer() + " isIndex=" + isIndexTerm); + if (isIndexTerm) + proxOffset = termsIn.readVLong(); + else + proxOffset += termsIn.readVLong(); + if (PostingsCodec.DEBUG) + System.out.println(" proxOffset=" + proxOffset); + if (positions != null) { + positions.seekPending = true; + positions.skipOffset = proxOffset; + positions.skipPosCount = 0; + } + } + + void close() throws IOException { + } + + SegmentPositionsEnum positions; + + PositionsEnum positions() throws IOException { + + if (positions == null) + // Lazy init + positions = new SegmentPositionsEnum(); + + return positions; + } + + // nocommit -- should we have different reader for + // payload vs no payload? 
+ class SegmentPositionsEnum extends PositionsEnum { + + // nocommit + String desc; + + final IndexInput proxIn; + + final boolean storePayloads; + + boolean seekPending; // True if we must seek before reading next position + boolean payloadPending; // True if we must skip payload before reading next position + + long skipOffset; + int skipPosCount; + + int position; + int payloadLength; + + SegmentPositionsEnum() { + if (PostingsCodec.DEBUG) + System.out.println("new pos enum"); + proxIn = (IndexInput) FormatPostingsPositionsReader.this.proxIn.clone(); + storePayloads = fieldInfo.storePayloads; + } + + void skip(long proxOffset, int lastPayloadLength, int numPositions) { + skipOffset = proxOffset; + payloadLength = lastPayloadLength; + assert payloadLength >= 0 || payloadLength == -1; + skipPosCount = numPositions; + seekPending = true; + payloadPending = false; + if (PostingsCodec.DEBUG) + System.out.println("pr [" + desc + "] skip fp= " + proxOffset + " numPositions=" + numPositions); + } + + void skip(int numPositions) { + skipPosCount += numPositions; + if (PostingsCodec.DEBUG) + System.out.println("pr [" + desc + "] skip " + numPositions + " positions; now " + skipPosCount); + } + + void catchUp(int currentCount) throws IOException { + if (PostingsCodec.DEBUG) + System.out.println(" pos catchup: seekPending=" + seekPending + " skipOffset=" + skipOffset + " skipPosCount " + skipPosCount + " vs currentCount " + currentCount); + if (seekPending) { + proxIn.seek(skipOffset); + seekPending = false; + } + + while(skipPosCount > currentCount) + next(); + if (PostingsCodec.DEBUG) + System.out.println(" pos catchup done"); + positions.init(); + } + + void init() { + if (PostingsCodec.DEBUG) + System.out.println(" pos init"); + position = 0; + } + + int next() throws IOException { + + if (PostingsCodec.DEBUG) + System.out.println(" pr.next [" + desc + "]: fp=" + proxIn.getFilePointer() + " return pos=" + position); + + if (storePayloads) { + + if (payloadPending && payloadLength > 0) { + if (PostingsCodec.DEBUG) + System.out.println(" payload pending: skip " + payloadLength + " bytes"); + proxIn.seek(proxIn.getFilePointer()+payloadLength); + } + + final int code = proxIn.readVInt(); + if ((code & 1) != 0) { + // Payload length has changed + payloadLength = proxIn.readVInt(); + assert payloadLength >= 0; + if (PostingsCodec.DEBUG) + System.out.println(" new payloadLen=" + payloadLength); + } + assert payloadLength != -1; + + payloadPending = true; + position += code >>> 1; + } else + position += proxIn.readVInt(); + + skipPosCount--; + + // NOTE: the old API actually allowed this... 
+ assert skipPosCount >= 0: "next() was called too many times (more than FormatPostingsDocsEnum.freq() times)"; + + if (PostingsCodec.DEBUG) + System.out.println(" proxFP=" + proxIn.getFilePointer() + " return pos=" + position); + return position; + } + + int getPayloadLength() { + return payloadLength; + } + + byte[] getPayload(byte[] data, int offset) throws IOException { + + if (!payloadPending) + throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once."); + + final byte[] retArray; + final int retOffset; + if (data == null || data.length-offset < payloadLength) { + // the array is too small to store the payload data, + // so we allocate a new one + retArray = new byte[payloadLength]; + retOffset = 0; + } else { + retArray = data; + retOffset = offset; + } + + proxIn.readBytes(retArray, retOffset, payloadLength); + payloadPending = false; + return retArray; + } + + public boolean hasPayload() { + return payloadPending && payloadLength > 0; + } + } + } +} Property changes on: src/java/org/apache/lucene/index/FormatPostingsPositionsReader.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/SegmentInfo.java =================================================================== --- src/java/org/apache/lucene/index/SegmentInfo.java (revision 718730) +++ src/java/org/apache/lucene/index/SegmentInfo.java (working copy) @@ -78,6 +78,7 @@ // (if it's an older index) private boolean hasProx; // True if this segment has any fields with omitTf==false + private boolean flexPostings; // True if postings were written with new flex format public SegmentInfo(String name, int docCount, Directory dir) { this.name = name; @@ -92,6 +93,7 @@ docStoreIsCompoundFile = false; delCount = 0; hasProx = true; + flexPostings = true; } public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile) { @@ -108,6 +110,7 @@ this.docStoreSegment = docStoreSegment; this.docStoreIsCompoundFile = docStoreIsCompoundFile; this.hasProx = hasProx; + flexPostings = true; delCount = 0; assert docStoreOffset == -1 || docStoreSegment != null: "dso=" + docStoreOffset + " dss=" + docStoreSegment + " docCount=" + docCount; } @@ -188,6 +191,12 @@ hasProx = input.readByte() == 1; else hasProx = true; + + if (format <= SegmentInfos.FORMAT_FLEX_POSTINGS) + flexPostings = input.readByte() == 1; + else + flexPostings = false; + } else { delGen = CHECK_DIR; normGen = null; @@ -294,6 +303,8 @@ si.docStoreOffset = docStoreOffset; si.docStoreSegment = docStoreSegment; si.docStoreIsCompoundFile = docStoreIsCompoundFile; + si.hasProx = hasProx; + si.flexPostings = flexPostings; return si; } @@ -517,6 +528,7 @@ output.writeByte(isCompoundFile); output.writeInt(delCount); output.writeByte((byte) (hasProx ? 1:0)); + output.writeByte((byte) (flexPostings ? 
1:0)); } void setHasProx(boolean hasProx) { @@ -528,6 +540,10 @@ return hasProx; } + boolean getFlexPostings() { + return flexPostings; + } + private void addIfExists(List files, String fileName) throws IOException { if (dir.fileExists(fileName)) files.add(fileName); @@ -540,7 +556,15 @@ */ public List files() throws IOException { + return files(null); + } + public List files(PostingsCodec codec) throws IOException { + + // nocommit -- higher up + if (codec == null) + codec = PostingsCodec.getCodec(); + if (files != null) { // Already cached: return files; @@ -555,7 +579,13 @@ } else { final String[] exts = IndexFileNames.NON_STORE_INDEX_EXTENSIONS; for(int i=0;i 0 terms + Iterator it = terms2.fields.keySet().iterator(); + int i = 0; + while(it.hasNext()) + fields[i++] = ((FieldInfo) it.next()).name; + Arrays.sort(fields); + fieldUpto = -1; + /* + System.out.println("sr.tdte: " + numFields + " fields"); + for(i=0;i= fields.length-1) + return false; + fieldUpto++; + //System.out.println("sr.tdte: now get new field fieldUpto=" + fieldUpto + " name=" + fields[fieldUpto]); + currentField = terms.getField(fieldInfos.fieldInfo(fields[fieldUpto])).terms(); + } + if (currentField.next()) + // This field still has terms + return true; + else { + // Done producing terms from this field + currentField.close(); + currentField = null; + } + } + } + + public Term term() { + if (currentField != null) { + final String text = currentField.text(); + if (text != null) + return new Term(fields[fieldUpto], text); + } + return null; + } + + public int docFreq() { + if (currentField == null) + return 0; + else + return currentField.docFreq(); + } + + public void close() throws IOException { + if (currentField != null) { + currentField.close(); + currentField = null; + } + fieldUpto = fields.length; + } + + // Seek forward only + public boolean skipTo(Term target) throws IOException { + // Just use seek, if the target is beyond our current + // point, else next(): + + if (fieldUpto >= fields.length) + // Already EOF + return false; + + if (fieldUpto >= 0) { + + final int cmp = target.field.compareTo(fields[fieldUpto]); + if (cmp < 0) + // Target is before our current term + return next(); + else if (cmp == 0) { + final int cmp2 = target.text.compareTo(currentField.text()); + if (cmp2 < 0) + // Target is before our current term + return next(); + } + } + + // OK target is in the future, so just seek + return seek(target); + } + + public boolean seek(Term target) throws IOException { + + if (currentField == null || !fields[fieldUpto].equals(target.field)) { + // Seek field + if (currentField != null) { + currentField.close(); + currentField = null; + } + + // nocommit -- binary search + fieldUpto = 0; + int cmp = 0; + + while(fieldUpto < fields.length) { + cmp = target.field.compareTo(fields[fieldUpto]); + if (cmp == 0) + break; + else if (cmp < 0) { + fieldUpto--; + return next(); + } + fieldUpto++; + } + + if (fieldUpto == fields.length) + return false; + + currentField = terms.getField(fieldInfos.fieldInfo(fields[fieldUpto])).terms(); + + assert currentField != null; + assert fields[fieldUpto].equals(target.field); + } + + // Field matches; now seek text + currentField.seek(target.text); + return currentField.text() != null; + } + } + + // Back compat + class TermsDictTermDocs implements TermDocs { + + String currentField; + TermsEnum currentFieldTerms; + DocsEnum docs; + + public void close() throws IOException { + if (docs != null) { + docs.close(); + docs = null; + } + if (currentFieldTerms != null) { + 
currentFieldTerms.close(); + currentFieldTerms = null; + } + } + + public void seek(TermEnum termEnum) throws IOException { + // nocommit -- optimize for the special cases here + seek(termEnum.term()); + } + + public boolean skipTo(int target) throws IOException { + if (docs == null) return false; + return docs.skip(target); + } + + public int read(int[] docs, int[] freqs) throws IOException { + if (this.docs == null) return 0; + return this.docs.read(docs, freqs); + } + + public void seek(Term term) throws IOException { + + if (PostingsCodec.DEBUG) + System.out.println("\nwrapper termdocs.seek term=" + term); + + if (currentField != null && !term.field.equals(currentField)) { + if (PostingsCodec.DEBUG) + System.out.println(" clear current field " + currentField); + if (currentFieldTerms != null) { + currentFieldTerms.close(); + currentFieldTerms = null; + } + currentField = null; + } + + if (currentFieldTerms == null) { + currentField = term.field; + TermsProducer field = terms.getField(fieldInfos.fieldInfo(term.field)); + if (PostingsCodec.DEBUG) + System.out.println(" lookup field=" + field); + if (field != null) { + currentFieldTerms = field.terms(); + if (PostingsCodec.DEBUG) + System.out.println(" got terms=" + currentFieldTerms); + } + } + + if (currentFieldTerms != null) { + if (currentFieldTerms.seek(term.text)) { + if (PostingsCodec.DEBUG) + System.out.println(" seek true: " + currentFieldTerms.text()); + if (currentFieldTerms.text().equals(term.text)) + docs = currentFieldTerms.docs(deletedDocs); + else + docs = null; + } else + docs = null; + } else + docs = null; + } + + public int doc() { + if (docs == null) return 0; + return docs.doc(); + } + + public int freq() { + if (docs == null) return 0; + return docs.freq(); + } + + public boolean next() throws IOException { + if (docs == null) return false; + return docs.next(); + } + } + + // Back compat + final class TermsDictTermPositions extends TermsDictTermDocs implements TermPositions { + + PositionsEnum positions; + + public void seek(TermEnum termEnum) throws IOException { + super.seek(termEnum); + if (docs != null) + positions = docs.positions(); + } + + public boolean skipTo(int target) throws IOException { + boolean result = super.skipTo(target); + if (result && docs != null) + positions = docs.positions(); + else + positions = null; + return result; + } + + public int read(int[] docs, int[] freqs) throws IOException { + throw new UnsupportedOperationException("TermPositions does not support processing multiple documents in one call. 
Use TermDocs instead."); + } + + public void seek(Term term) throws IOException { + super.seek(term); + if (docs != null) + positions = docs.positions(); + else + positions = null; + } + + public boolean next() throws IOException { + boolean result = super.next(); + if (result && docs != null) + positions = docs.positions(); + else + positions = null; + return result; + } + + public int nextPosition() throws IOException { + return positions.next(); + } + + public int getPayloadLength() { + return positions.getPayloadLength(); + } + + public byte[] getPayload(byte[] data, int offset) throws IOException { + return positions.getPayload(data, offset); + } + + public boolean isPayloadAvailable() { + return positions.hasPayload(); + } + } +} \ No newline at end of file Index: src/java/org/apache/lucene/index/SegmentTermEnum.java =================================================================== --- src/java/org/apache/lucene/index/SegmentTermEnum.java (revision 718730) +++ src/java/org/apache/lucene/index/SegmentTermEnum.java (working copy) @@ -20,6 +20,10 @@ import java.io.IOException; import org.apache.lucene.store.IndexInput; +/** + * @deprecated No longer used with flex indexing, except for + * reading old segments */ + final class SegmentTermEnum extends TermEnum implements Cloneable { private IndexInput input; FieldInfos fieldInfos; Index: src/java/org/apache/lucene/index/FormatPostingsTermsDictReader.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsTermsDictReader.java (revision 0) +++ src/java/org/apache/lucene/index/FormatPostingsTermsDictReader.java (revision 0) @@ -0,0 +1,453 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + import java.io.IOException; + import java.util.Map; + import java.util.HashMap; + import java.util.Collection; + + import org.apache.lucene.store.IndexInput; + import org.apache.lucene.store.IndexOutput; + import org.apache.lucene.store.Directory; + import org.apache.lucene.util.UnicodeUtil; + import org.apache.lucene.util.ArrayUtil; + import org.apache.lucene.util.BitVector; + + class FormatPostingsTermsDictReader extends FieldsProducer { + private final IndexInput in; + private final IndexInput indexIn; + private final int indexInterval; + private final FormatPostingsTermsDictDocsReader docs; + + private int indexDivisor = 1; + private boolean anyIndexRead; + + int totalIndexInterval; + + private final FieldInfos fieldInfos; + final Map fields = new HashMap(); + private final String segment; + + FormatPostingsTermsDictReader(Directory dir, FieldInfos fieldInfos, String segment, FormatPostingsTermsDictDocsReader docs, int readBufferSize) throws IOException { + in = dir.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.TERMS_EXTENSION), readBufferSize); + boolean success = false; + try { + indexIn = dir.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.TERMS_INDEX_EXTENSION), readBufferSize); + success = true; + } finally { + if (!success) + in.close(); + } + + success = false; + try { + + this.fieldInfos = fieldInfos; + this.segment = segment; + int format = in.readInt(); + if (format != FormatPostingsTermsDictWriter.FORMAT) + throw new CorruptIndexException("format mismatch"); + + final long dirOffset = in.readLong(); + indexInterval = in.readInt(); + totalIndexInterval = indexInterval; + + this.docs = docs; + docs.start(in); + in.seek(dirOffset); + + final int numFields = in.readInt(); + // nocommit -- why did i want to order by field number? + //int lastFieldNumber = -1; + for(int i=0;i<numFields;i++) { + final int field = in.readInt(); + //assert field > lastFieldNumber; + //lastFieldNumber = field; + final long numTerms = in.readLong(); + final long termsStartPointer = in.readLong(); + final long indexStartPointer = in.readLong(); + if (numTerms > 0) { + final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); + fields.put(fieldInfo, new FieldReader(fieldInfo, numTerms, termsStartPointer, indexStartPointer)); + } + } + success = true; + } finally { + if (!success) { + try { + in.close(); + } finally { + indexIn.close(); + } + } + } + } + + public TermsProducer getField(FieldInfo fieldInfo) { + return (TermsProducer) fields.get(fieldInfo); + } + + public void close() throws IOException { + try { + in.close(); + } finally { + try { + indexIn.close(); + } finally { + docs.close(); + } + } + } + + static void files(SegmentInfo segmentInfo, Collection files) { + files.add(IndexFileNames.segmentFileName(segmentInfo.name, IndexFileNames.TERMS_EXTENSION)); + files.add(IndexFileNames.segmentFileName(segmentInfo.name, IndexFileNames.TERMS_INDEX_EXTENSION)); + } + + /** + *

Sets the indexDivisor, which subsamples the number + * of indexed terms loaded into memory. This has a + * similar effect to {@link + * IndexWriter#setTermIndexInterval}, except that that setting + * must be done at indexing time while this one can be + * set per reader. When set to N, one in every + * N*termIndexInterval terms in the index is loaded into + * memory. By setting this to a value > 1 you can reduce + * memory usage, at the expense of higher latency when + * loading a TermInfo. The default value is 1.

+ * + * NOTE: you must call this before the term + * index is loaded. If the index is already loaded, + * an IllegalStateException is thrown. + * + + @throws IllegalStateException if the term index has + * already been loaded into memory. + */ + public void setIndexDivisor(int indexDivisor) throws IllegalStateException { + if (indexDivisor < 1) + throw new IllegalArgumentException("indexDivisor must be > 0: got " + indexDivisor); + + if (anyIndexRead) + throw new IllegalStateException("index terms are already loaded"); + + this.indexDivisor = indexDivisor; + totalIndexInterval = indexInterval * indexDivisor; + } + + /** Returns the indexDivisor. + * @see #setIndexDivisor + */ + public int getIndexDivisor() { + return indexDivisor; + } + + // nocommit -- static? + private class FieldReader extends TermsProducer { + + final long numTerms; + final FieldInfo fieldInfo; + final long indexStartPointer; + final long termsStartPointer; + + // TODO: genericize "skipper" API so that we could swap + // in a multi-level skipper, here, instead of flat one: + // TODO: we could save mem here by packing our own shared char[]'s + String[] indexTerms; + long[] indexOffsets; + + FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long indexStartPointer) { + this.fieldInfo = fieldInfo; + this.numTerms = numTerms; + assert numTerms > 0; + this.indexStartPointer = indexStartPointer; + this.termsStartPointer = termsStartPointer; + } + + synchronized final void readIndex() throws IOException { + if (indexTerms != null) + return; + + final int indexSize = (int) (1+(numTerms-1)/totalIndexInterval); + + if (PostingsCodec.DEBUG) + System.out.println(" tdr.readIndex field=" + fieldInfo.name + " numTerms=" + numTerms + " indexSize=" + indexSize + " indexSeek=" + indexStartPointer + " segment=" + segment + " indexDivisor=" + indexDivisor); + + IndexInput in = (IndexInput) indexIn.clone(); + in.seek(indexStartPointer); + + indexTerms = new String[indexSize]; + indexOffsets = new long[indexSize]; + + if (PostingsCodec.DEBUG) + System.out.println("read index for field=" + fieldInfo.name); + + long pointer = termsStartPointer; + final DeltaBytesReader bytesReader = new DeltaBytesReader(in); + final int numIndexTerms = (int) (1+(numTerms-1)/indexInterval); + int upto = 0; + for(int i=0;i= numTerms) { + termUpto++; + return false; + } + if (PostingsCodec.DEBUG) { + System.out.println("tdr.next: field=" + fieldInfo.name + " termsInPointer=" + in.getFilePointer() + " vs len=" + in.length() + " isIndex=" + (termUpto%indexInterval==0) + " this=" + this); + //new Throwable().printStackTrace(System.out); + } + bytesReader.read(); + docFreq = in.readVInt(); + if (PostingsCodec.DEBUG) + System.out.println(" text=" + bytesReader.text() + " freq=" + docFreq); + docs.readTerm(docFreq, termUpto % indexInterval == 0); + termUpto++; + if (PostingsCodec.DEBUG) + System.out.println(" termUpto=" + termUpto + " vs numTerms=" + numTerms + " fp=" + in.getFilePointer()); + return true; + } + + public int docFreq() { + if (termUpto >= 1+numTerms) + return 0; + else + return docFreq; + } + + public String text() { + // nocommit -- really necessary? 
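// [Aside, not in the original patch: with the divisor in effect, readIndex()
// above keeps indexSize = 1 + (numTerms-1)/(indexInterval*indexDivisor)
// index entries in RAM; e.g. numTerms=1,000,000 with indexInterval=128
// gives 7,813 entries at indexDivisor=1 but only 1,954 at indexDivisor=4.]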
+ if (termUpto >= 1+numTerms) + return null; + else + return bytesReader.text(); + } + + public long ord() { + return termUpto-1; + } + + public DocsEnum docs(BitVector deletedDocs) throws IOException { + doSkip = false; + // nocommit + DocsEnum docsEnum = docs.docs(deletedDocs); + docsEnum.desc = fieldInfo.name + ":" + bytesReader.text(); + return docsEnum; + } + + public void close() throws IOException { + in.close(); + docs.close(); + } + } + } + + private static class DeltaBytesReader { + private byte[] bytes; + final UnicodeUtil.UTF16Result chars = new UnicodeUtil.UTF16Result(); + final UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result(); + private int length; + final IndexInput in; + boolean started; + + DeltaBytesReader(IndexInput in) { + this.in = in; + bytes = new byte[10]; + } + + void reset(String text) { + UnicodeUtil.UTF16toUTF8(text, 0, text.length(), utf8); + if (utf8.length > bytes.length) + bytes = ArrayUtil.grow(bytes, utf8.length); + System.arraycopy(utf8.result, 0, + this.bytes, 0, utf8.length); + this.length = utf8.length; + chars.copyText(text); + } + + String text() { + // nocommit -- cache this? + return new String(chars.result, 0, chars.length); + } + + int compareTo(String other) { + + final int otherLength = other.length(); + final int minLength; + if (otherLength < chars.length) + minLength = otherLength; + else + minLength = chars.length; + + for(int i=0;i<minLength;i++) { + final char c = chars.result[i]; + final char otherC = other.charAt(i); + if (c < otherC) + return -1; + else if (c > otherC) + return 1; + } + + if (chars.length < otherLength) + return -1; + else if (chars.length > otherLength) + return 1; + else + return 0; + } + + void read() throws IOException { + //System.out.println("terms reader fp=" + in.getFilePointer() + " this=" + this); + final int start = in.readVInt(); + final int suffix = in.readVInt(); + //System.out.println(" start=" + start + " suffix=" + suffix); + assert start <= length: "start=" + start + " length=" + length; + + if (start + suffix > bytes.length) + bytes = ArrayUtil.grow(bytes, start+suffix); + in.readBytes(bytes, start, suffix); + length = start + suffix; + + // TODO: conversion could be incremental + UnicodeUtil.UTF8toUTF16(bytes, 0, length, chars); + started = true; + } + } + +} \ No newline at end of file Property changes on: src/java/org/apache/lucene/index/FormatPostingsTermsDictReader.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/FormatPostingsTermsDictDocsReader.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsTermsDictDocsReader.java (revision 0) +++ src/java/org/apache/lucene/index/FormatPostingsTermsDictDocsReader.java (revision 0) @@ -0,0 +1,46 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.BitVector; + +/** TermsDictReader interacts with a single instance of this + * to manage creation of multiple docs enum + * instances. */ +abstract class FormatPostingsTermsDictDocsReader { + + abstract class Reader { + abstract void readTerm(int docFreq, boolean isIndexTerm) throws IOException; + + /** Returns a docs enum for the last term read */ + abstract DocsEnum docs(BitVector deletedDocs) throws IOException; + + abstract void close() throws IOException; + } + + abstract void start(IndexInput termsIn) throws IOException; + + /** Returns a new private reader for stepping through + * terms, getting DocsEnum. */ + abstract Reader reader(FieldInfo fieldInfo, IndexInput termsIn) throws IOException; + + abstract void close() throws IOException; +} Property changes on: src/java/org/apache/lucene/index/FormatPostingsTermsDictDocsReader.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/FormatPostingsTermsDictWriter.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsTermsDictWriter.java (revision 0) +++ src/java/org/apache/lucene/index/FormatPostingsTermsDictWriter.java (revision 0) @@ -0,0 +1,242 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.List; +import java.util.ArrayList; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.ArrayUtil; + +/** + * Writes terms dict and interacts with docs/positions + * consumers to write the postings files. + * + * The [new] terms dict format is field-centric: each field + * has its own section in the file. Fields are written in + * UTF16 string comparison order. Within each field, each + * term's text is written in UTF16 string comparison order. 
+ */ + +class FormatPostingsTermsDictWriter extends FieldsConsumer { + + // Initial format + public static final int FORMAT = -1; + + public static final int FORMAT_CURRENT = FORMAT; + + private final int indexInterval; + private final DeltaBytesWriter termWriter; + private final DeltaBytesWriter termIndexWriter; + + final IndexOutput out; + final IndexOutput indexOut; + final DocsConsumer consumer; + final FieldInfos fieldInfos; + FieldInfo currentField; + + private List fields = new ArrayList(); + + // nocommit + private String segment; + + FormatPostingsTermsDictWriter(SegmentWriteState state, DocsConsumer consumer) throws IOException { + final String termsFileName = IndexFileNames.segmentFileName(state.segmentName, IndexFileNames.TERMS_EXTENSION); + out = state.directory.createOutput(termsFileName); + state.flushedFiles.add(termsFileName); + this.segment = state.segmentName; + + if (PostingsCodec.DEBUG) + System.out.println("tdw: write to segment=" + state.segmentName); + + final String indexFileName = IndexFileNames.segmentFileName(state.segmentName, IndexFileNames.TERMS_INDEX_EXTENSION); + indexOut = state.directory.createOutput(indexFileName); + state.flushedFiles.add(indexFileName); + + fieldInfos = state.fieldInfos; + indexInterval = state.termIndexInterval; + + // Count indexed fields up front + final int numFields = fieldInfos.size(); + + out.writeInt(FORMAT_CURRENT); // write format + out.writeLong(0); // leave space for end index pointer + out.writeInt(indexInterval); // write indexInterval + + termWriter = new DeltaBytesWriter(out); + termIndexWriter = new DeltaBytesWriter(indexOut); + currentField = null; + this.consumer = consumer; + + consumer.start(out); // have consumer write its format/header + } + + TermsConsumer addField(FieldInfo field) { + if (PostingsCodec.DEBUG) + System.out.println("tdw.addField: field=" + field.name); + assert currentField == null || currentField.name.compareTo(field.name) < 0; + currentField = field; + TermsConsumer terms = new TermsWriter(field, consumer); + fields.add(terms); + return terms; + } + + void close() throws IOException { + try { + final long indexPointer = out.getFilePointer(); + final int fieldCount = fields.size(); + out.writeInt(fieldCount); + for(int i=0;i= 0 || payloadLength == -1; + skipPosCount = numPositions; + seekPending = true; + payloadPending = false; + if (DEBUG) + System.out.println("pr [" + desc + "] skip posFP= " + posOffset + " payloadFP=" + payloadOffset + " numPositions=" + numPositions); + } + + void skip(int numPositions) { + skipPosCount += numPositions; + if (DEBUG) + System.out.println("pr [" + desc + "] skip " + numPositions + " positions; now " + skipPosCount); + } + + void catchUp(int currentCount) throws IOException { + if (DEBUG) + System.out.println(" pos catchup: seekPending=" + seekPending + " skipPosFP=" + skipPosOffset + " skipPayloadFP=" + skipPayloadOffset + " skipPosCount " + skipPosCount + " vs currentCount " + currentCount); + if (seekPending) { + posIn.seek(skipPosOffset); + if (storePayloads) + payloadIn.seek(skipPayloadOffset); + seekPending = false; + } + + while(skipPosCount > currentCount) + next(); + if (DEBUG) + System.out.println(" pos catchup done"); + positions.init(); + } + + void init() { + if (DEBUG) + System.out.println(" pos init"); + position = 0; + } + + int next() throws IOException { + + if (DEBUG) + System.out.println(" pr.next [" + desc + "]: posFP=" + posIn.getFilePointer() + " return pos=" + position); + + final int code = posIn.readVInt(); + + if (storePayloads) { + 
+ if (payloadPending && payloadLength > 0) { + if (DEBUG) + System.out.println(" payload pending: skip " + payloadLength + " bytes"); + // TODO: we could do this lazily, when + // getPayload() is called + payloadIn.seek(payloadIn.getFilePointer()+payloadLength); + } + + if ((code & 1) != 0) { + // Payload length has changed + payloadLength = posIn.readVInt(); + assert payloadLength >= 0; + if (DEBUG) + System.out.println(" new payloadLen=" + payloadLength); + } + assert payloadLength != -1; + + payloadPending = true; + position += code >>> 1; + } else + position += code; + + skipPosCount--; + + // NOTE: the old API actually allowed this... + assert skipPosCount >= 0: "next() was called too many times (more than FormatPostingsDocsEnum.freq() times)"; + + if (DEBUG) + System.out.println(" proxFP=" + posIn.getFilePointer() + " return pos=" + position); + + return position; + } + + int getPayloadLength() { + return payloadLength; + } + + byte[] getPayload(byte[] data, int offset) throws IOException { + + if (!payloadPending) + throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once."); + + final byte[] retArray; + final int retOffset; + if (data == null || data.length-offset < payloadLength) { + // the array is too small to store the payload data, + // so we allocate a new one + retArray = new byte[payloadLength]; + retOffset = 0; + } else { + retArray = data; + retOffset = offset; + } + + payloadIn.readBytes(retArray, retOffset, payloadLength); + payloadPending = false; + return retArray; + } + + public boolean hasPayload() { + return payloadPending && payloadLength > 0; + } + } + } +} \ No newline at end of file Property changes on: src/java/org/apache/lucene/index/FormatSepPositionsReader.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/TermsProducer.java =================================================================== --- src/java/org/apache/lucene/index/TermsProducer.java (revision 0) +++ src/java/org/apache/lucene/index/TermsProducer.java (revision 0) @@ -0,0 +1,30 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +/** + * NOTE: this API is experimental and will likely change + */ + +abstract class TermsProducer { + /** Returns a "private" terms enumerator */ + abstract TermsEnum terms() throws IOException; +} + Index: src/java/org/apache/lucene/index/SegmentInfos.java =================================================================== --- src/java/org/apache/lucene/index/SegmentInfos.java (revision 718730) +++ src/java/org/apache/lucene/index/SegmentInfos.java (working copy) @@ -72,8 +72,12 @@ /** This format adds optional commit userData (String) storage. */ public static final int FORMAT_USER_DATA = -8; + /** Each segment records whether its postings are written + * in the new flex format */ + public static final int FORMAT_FLEX_POSTINGS = -9; + /* This must always point to the most recent file format. */ - static final int CURRENT_FORMAT = FORMAT_USER_DATA; + static final int CURRENT_FORMAT = FORMAT_FLEX_POSTINGS; public int counter = 0; // used to name new segments /** Index: src/java/org/apache/lucene/index/MultiLevelSkipListWriter.java =================================================================== --- src/java/org/apache/lucene/index/MultiLevelSkipListWriter.java (revision 718730) +++ src/java/org/apache/lucene/index/MultiLevelSkipListWriter.java (working copy) @@ -134,6 +134,7 @@ */ long writeSkip(IndexOutput output) throws IOException { long skipPointer = output.getFilePointer(); + //System.out.println("skipper.writeSkip fp=" + skipPointer); if (skipBuffer == null || skipBuffer.length == 0) return skipPointer; for (int level = numberOfSkipLevels - 1; level > 0; level--) { Index: src/java/org/apache/lucene/index/DefaultSkipListWriter.java =================================================================== --- src/java/org/apache/lucene/index/DefaultSkipListWriter.java (revision 718730) +++ src/java/org/apache/lucene/index/DefaultSkipListWriter.java (working copy) @@ -35,7 +35,8 @@ private long[] lastSkipProxPointer; private IndexOutput freqOutput; - private IndexOutput proxOutput; + // nocommit -- private again + IndexOutput proxOutput; private int curDoc; private boolean curStorePayloads; @@ -81,6 +82,8 @@ Arrays.fill(lastSkipFreqPointer, freqOutput.getFilePointer()); if (proxOutput != null) Arrays.fill(lastSkipProxPointer, proxOutput.getFilePointer()); + if (PostingsCodec.DEBUG) + System.out.println(" skip writer base freqFP=" + freqOutput.getFilePointer() + " proxFP=" + proxOutput.getFilePointer()); } protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException { Index: src/java/org/apache/lucene/index/CheckIndex.java =================================================================== --- src/java/org/apache/lucene/index/CheckIndex.java (revision 718730) +++ src/java/org/apache/lucene/index/CheckIndex.java (working copy) @@ -310,6 +310,8 @@ sFormat = "FORMAT_HAS_PROX [Lucene 2.4]"; else if (format == SegmentInfos.FORMAT_USER_DATA) sFormat = "FORMAT_USER_DATA [Lucene 2.9]"; + else if (format == SegmentInfos.FORMAT_FLEX_POSTINGS) + sFormat = "FORMAT_FLEX_POSTINGS [Lucene 2.9]"; else if (format < SegmentInfos.CURRENT_FORMAT) { sFormat = "int=" + format + " [newer version of Lucene than this tool]"; skip = true; @@ -428,7 +430,11 @@ // Used only to count up # deleted docs for this // term - final MySegmentTermDocs myTermDocs = new MySegmentTermDocs(reader); + final MySegmentTermDocs myTermDocs; + if (info.getFlexPostings()) + myTermDocs = null; + else + myTermDocs = new MySegmentTermDocs(reader); long termCount = 0; long totFreq 
= 0; @@ -464,17 +470,21 @@ // Now count how many deleted docs occurred in // this term: - final int delCount; - if (reader.hasDeletions()) { - myTermDocs.seek(term); - while(myTermDocs.next()) { - } - delCount = myTermDocs.delCount; - } else - delCount = 0; + + // nocommit -- do this check w/ flex postings too + if (!info.getFlexPostings()) { + final int delCount; + if (reader.hasDeletions()) { + myTermDocs.seek(term); + while(myTermDocs.next()) { + } + delCount = myTermDocs.delCount; + } else + delCount = 0; - if (freq0 + delCount != docFreq) - throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != num docs seen " + freq0 + " + num docs deleted " + delCount); + if (freq0 + delCount != docFreq) + throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != num docs seen " + freq0 + " + num docs deleted " + delCount); + } } msg("OK [" + termCount + " terms; " + totFreq + " terms/docs pairs; " + totPos + " tokens]"); Index: src/java/org/apache/lucene/index/FormatSepPositionsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FormatSepPositionsWriter.java (revision 0) +++ src/java/org/apache/lucene/index/FormatSepPositionsWriter.java (revision 0) @@ -0,0 +1,156 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.IndexInput; + +import java.io.IOException; + +final class FormatSepPositionsWriter extends PositionsConsumer { + + final static String CODEC = "SepPositionsPayloads"; + + // Increment version to change it: + final static int VERSION_START = 0; + final static int VERSION_CURRENT = VERSION_START; + + final FormatSepDocsWriter parent; + final IndexOutput posOut; + final IndexOutput payloadOut; + + IndexOutput termsOut; + + boolean omitTF; + boolean storePayloads; + int lastPayloadLength = -1; + + // nocommit + String desc; + + FormatSepPositionsWriter(SegmentWriteState state, FormatSepDocsWriter parent) throws IOException { + this.parent = parent; + omitTF = parent.omitTF; + if (state.fieldInfos.hasProx()) { + // At least one field does not omit TF, so create the + // prox file + final String posFileName = IndexFileNames.segmentFileName(state.segmentName, IndexFileNames.PROX_EXTENSION); + state.flushedFiles.add(posFileName); + posOut = state.directory.createOutput(posFileName); + + // nocommit -- only if at least one field stores payloads? 
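// [Aside, not in the original patch: the "sep" codec splits a segment's
// postings across separate streams -- .frq (freqs), "doc" (doc deltas),
// "skp" (skip data), .prx (position deltas) and the "pyl" file created
// just below (raw payload bytes); compare the files() lists in
// FormatSepDocsReader and FormatSepPositionsReader.]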
+ final String payloadFileName = IndexFileNames.segmentFileName(state.segmentName, "pyl"); + state.flushedFiles.add(payloadFileName); + payloadOut = state.directory.createOutput(payloadFileName); + + parent.skipListWriter.setPosOutput(posOut); + parent.skipListWriter.setPayloadOutput(payloadOut); + } else { + // Every field omits TF so we will write no prox file + posOut = null; + payloadOut = null; + } + } + + void start(IndexOutput termsOut) throws IOException { + this.termsOut = termsOut; + PostingsCodec.writeHeader(termsOut, CODEC, VERSION_CURRENT); + } + + long posStart; + long lastPosStart; + long payloadStart; + long lastPayloadStart; + + void startTerm() { + posStart = posOut.getFilePointer(); + payloadStart = payloadOut.getFilePointer(); + lastPayloadLength = -1; + } + + int lastPosition; + + /** Add a new position & payload */ + void addPosition(int position, byte[] payload, int payloadOffset, int payloadLength) throws IOException { + assert !omitTF: "omitTF is true"; + assert posOut != null; + if (PostingsCodec.DEBUG) + if (payload != null) + System.out.println("pw.addPos [" + desc + "]: pos=" + position + " posFP=" + posOut.getFilePointer() + " payloadFP=" + payloadOut.getFilePointer() + " payload=" + payloadLength + " bytes"); + else + System.out.println("pw.addPos [" + desc + "]: pos=" + position + " posFP=" + posOut.getFilePointer() + " payloadFP=" + payloadOut.getFilePointer()); + + final int delta = position - lastPosition; + lastPosition = position; + + if (storePayloads) { + if (PostingsCodec.DEBUG) + System.out.println(" store payloads"); + if (payloadLength != lastPayloadLength) { + if (PostingsCodec.DEBUG) + System.out.println(" payload len change old=" + lastPayloadLength + " new=" + payloadLength); + lastPayloadLength = payloadLength; + posOut.writeVInt((delta<<1)|1); + posOut.writeVInt(payloadLength); + } else + posOut.writeVInt(delta << 1); + if (payloadLength > 0) + payloadOut.writeBytes(payload, payloadOffset, payloadLength); + } else + posOut.writeVInt(delta); + } + + void setField(FieldInfo fieldInfo) { + omitTF = fieldInfo.omitTf; + storePayloads = omitTF ? 
false : fieldInfo.storePayloads; + } + + /** Called when we are done adding positions & payloads */ + void finishDoc() { + lastPosition = 0; + } + + void finishTerm(boolean isIndexTerm) throws IOException { + assert !omitTF; + + if (PostingsCodec.DEBUG) + System.out.println("poswriter finishTerm isIndex=" + isIndexTerm + " posStart=" + posStart + " pointer=" + termsOut.getFilePointer()); + + if (isIndexTerm) { + // Write absolute at seek points + termsOut.writeVLong(posStart); + termsOut.writeVLong(payloadStart); + } else { + termsOut.writeVLong(posStart-lastPosStart); + termsOut.writeVLong(payloadStart-lastPayloadStart); + } + + lastPosStart = posStart; + lastPayloadStart = payloadStart; + } + + void close() throws IOException { + try { + if (posOut != null) + posOut.close(); + } finally { + if (payloadOut != null) + payloadOut.close(); + } + } +} Property changes on: src/java/org/apache/lucene/index/FormatSepPositionsWriter.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/SepCodec.java =================================================================== --- src/java/org/apache/lucene/index/SepCodec.java (revision 0) +++ src/java/org/apache/lucene/index/SepCodec.java (revision 0) @@ -0,0 +1,59 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.Collection; +import java.io.IOException; + +import org.apache.lucene.store.Directory; + +class SepCodec extends PostingsCodec { + + FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + DocsConsumer docsWriter = new FormatSepDocsWriter(state); + boolean success = false; + try { + FieldsConsumer ret = new FormatPostingsTermsDictWriter(state, docsWriter); + success = true; + return ret; + } finally { + if (!success) + docsWriter.close(); + } + } + + FieldsProducer fieldsProducer(Directory dir, FieldInfos fieldInfos, SegmentInfo si, int readBufferSize) throws IOException { + FormatPostingsTermsDictDocsReader docsReader = new FormatSepDocsReader(dir, si, readBufferSize); + boolean success = false; + try { + FieldsProducer ret = new FormatPostingsTermsDictReader(dir, fieldInfos, si.name, + docsReader, + readBufferSize); + success = true; + return ret; + } finally { + if (!success) + docsReader.close(); + } + } + + void files(SegmentInfo segmentInfo, Collection files) { + FormatSepDocsReader.files(segmentInfo, files); + FormatPostingsTermsDictReader.files(segmentInfo, files); + } +} Property changes on: src/java/org/apache/lucene/index/SepCodec.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/FormatPulsingDocsReader.java =================================================================== --- src/java/org/apache/lucene/index/FormatPulsingDocsReader.java (revision 0) +++ src/java/org/apache/lucene/index/FormatPulsingDocsReader.java (revision 0) @@ -0,0 +1,281 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Collection; + +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BitVector; +import org.apache.lucene.util.ArrayUtil; + +import org.apache.lucene.index.FormatPulsingDocsWriter.Document; + +/** Concrete class that reads the current doc/freq/skip + * postings format */ + +// nocommit -- should we switch "hasProx" higher up? and +// create two separate docs readers, one that also reads +// prox and one that doesn't? 
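(Aside, not part of the patch: "pulsing" inlines the postings of low-frequency terms directly into the terms dict, so terms with docFreq <= maxPulsingDocFreq never touch the real postings files. The read-side dispatch in TermsDictReader.docs() below reduces to this sketch:)

    // Condensed from TermsDictReader.docs(); docFreq was set by readTerm()
    // and maxPulsingDocFreq comes from the terms-dict header.
    DocsEnum docs(BitVector deletedDocs) throws IOException {
      if (docFreq <= maxPulsingDocFreq) {
        docsEnum.reset(deletedDocs);   // replay postings inlined in the terms dict
        return docsEnum;
      } else
        return postingsReader.docs(deletedDocs);  // fall back to the postings files
    }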
+ +class FormatPulsingDocsReader extends FormatPostingsTermsDictDocsReader { + + // Fallback reader for non-pulsed terms: + final FormatPostingsTermsDictDocsReader postingsDocsReader; + IndexInput termsIn; + int maxPulsingDocFreq; + + FormatPulsingDocsReader(Directory dir, SegmentInfo segmentInfo, int readBufferSize, FormatPostingsTermsDictDocsReader postingsDocsReader) throws IOException { + this.postingsDocsReader = postingsDocsReader; + } + + static void files(SegmentInfo segmentInfo, Collection files) { + FormatPostingsDocsReader.files(segmentInfo, files); + } + + void start(IndexInput termsIn) throws IOException { + this.termsIn = termsIn; + PostingsCodec.checkHeader(termsIn, FormatPulsingDocsWriter.CODEC, FormatPulsingDocsWriter.VERSION_START); + maxPulsingDocFreq = termsIn.readVInt(); + postingsDocsReader.start(termsIn); + } + + Reader reader(FieldInfo fieldInfo, IndexInput termsIn) throws IOException { + return new TermsDictReader(fieldInfo, termsIn, postingsDocsReader.reader(fieldInfo, termsIn)); + } + + void close() throws IOException { + postingsDocsReader.close(); + } + + class TermsDictReader extends Reader { + + final IndexInput termsIn; + final FieldInfo fieldInfo; + final boolean omitTF; + final boolean storePayloads; + int docFreq; + + // Holds pulsed docs + final Document[] docs; + + private boolean pendingIndexTerm; + private final Reader postingsReader; + + TermsDictReader(FieldInfo fieldInfo, IndexInput termsIn, Reader postingsReader) { + this.termsIn = termsIn; // not cloned + this.fieldInfo = fieldInfo; + this.postingsReader = postingsReader; + omitTF = fieldInfo.omitTf; + storePayloads = fieldInfo.storePayloads; + docs = new Document[maxPulsingDocFreq]; + for(int i=0;i>>1; + if ((code & 1) != 0) + doc.numPositions = 1; + else + doc.numPositions = termsIn.readVInt(); + + if (doc.numPositions > doc.positions.length) + doc.reallocPositions(doc.numPositions); + + int position = 0; + int payloadLength = -1; + + for(int j=0;j>> 1; + if ((code2 & 1) != 0) + payloadLength = termsIn.readVInt(); + if (payloadLength > 0) { + if (pos.payload == null || payloadLength > pos.payload.length) + pos.payload = new byte[ArrayUtil.getNextSize(payloadLength)]; + termsIn.readBytes(pos.payload, 0, payloadLength); + } + } else + position += code2; + pos.pos = position; + pos.payloadLength = payloadLength; + } + } + doc.docID = docID; + } + + } else { + if (PostingsCodec.DEBUG) + System.out.println(" not pulsed pass isIndex=" + pendingIndexTerm); + + postingsReader.readTerm(docFreq, pendingIndexTerm); + pendingIndexTerm = false; + } + } + + public void close() throws IOException { + postingsReader.close(); + } + + final PulsingDocsEnum docsEnum = new PulsingDocsEnum(); + + DocsEnum docs(BitVector deletedDocs) throws IOException { + if (docFreq <= maxPulsingDocFreq) { + docsEnum.reset(deletedDocs); + return docsEnum; + } else + return postingsReader.docs(deletedDocs); + } + + class PulsingDocsEnum extends DocsEnum { + int nextRead; + private BitVector deletedDocs; + private Document doc; + + void close() {} + + void reset(BitVector deletedDocs) { + this.deletedDocs = deletedDocs; + nextRead = 0; + } + + boolean next() { + while(true) { + if (nextRead >= docFreq) + return false; + else { + doc = docs[nextRead++]; + if (deletedDocs == null || !deletedDocs.get(doc.docID)) + return true; + } + } + } + + int read(int[] retDocs, int[] retFreqs) { + final int limit; + int i=0; + // nocommit -- ob1? 
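// [Note, not in the original patch: the loop below fills retDocs/retFreqs
// without checking retDocs.length -- the unused "limit" local above suggests
// a bounds check was intended (likely the "ob1?" worry), so callers must
// pass arrays sized to at least docFreq.]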
+ while(nextRead < docFreq) { + doc = docs[nextRead++]; + if (deletedDocs == null || !deletedDocs.get(doc.docID)) { + retDocs[i] = doc.docID; + if (omitTF) + retFreqs[i] = 0; + else + retFreqs[i] = doc.numPositions; + i++; + } + } + return i; + } + + int doc() { + assert doc.docID >= 0: "got docID=" + doc.docID; + return doc.docID; + } + + int ord() { + assert nextRead <= docFreq; + return nextRead-1; + } + + int freq() { + return doc.numPositions; + } + + class PulsingPositionsEnum extends PositionsEnum { + int nextRead; + FormatPulsingDocsWriter.Position pos; + + void reset() { + nextRead = 0; + } + + int next() { + assert nextRead < doc.numPositions; + pos = doc.positions[nextRead++]; + return pos.pos; + } + + int getPayloadLength() { + return pos.payloadLength; + } + + boolean hasPayload() { + return pos.payloadLength > 0; + } + + byte[] getPayload(byte[] data, int offset) { + // nocommit -- inefficient + System.arraycopy(pos.payload, 0, data, offset, pos.payloadLength); + return data; + } + } + + final PulsingPositionsEnum positions = new PulsingPositionsEnum(); + + PositionsEnum positions() throws IOException { + positions.reset(); + return positions; + } + + boolean skip(int target) throws IOException { + while(next()) { + if (doc() >= target) + return true; + } + return false; + } + } + } +} Index: src/java/org/apache/lucene/index/FormatPulsingDocsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FormatPulsingDocsWriter.java (revision 0) +++ src/java/org/apache/lucene/index/FormatPulsingDocsWriter.java (revision 0) @@ -0,0 +1,277 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** Consumes doc & freq, writing them using the current + * index file format */ + +import java.io.IOException; +import java.util.List; +import java.util.ArrayList; + +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.store.IndexOutput; + +final class FormatPulsingDocsWriter extends DocsConsumer { + + final static String CODEC = "PulsedPostings"; + + // Increment version to change it: + final static int VERSION_START = 0; + final static int VERSION_CURRENT = VERSION_START; + + IndexOutput termsOut; + + boolean omitTF; + boolean storePayloads; + + // Starts a new term + FieldInfo fieldInfo; + + // nocommit + String desc; + + static class Document { + int docID; + int termDocFreq; + int numPositions; + Position[] positions; + Document() { + positions = new Position[1]; + positions[0] = new Position(); + } + + void reallocPositions(int minSize) { + final Position[] newArray = new Position[ArrayUtil.getNextSize(minSize)]; + System.arraycopy(positions, 0, newArray, 0, positions.length); + for(int i=positions.length;i maxPulsingDocFreq docs + + static class Position { + byte[] payload; + int pos; + int payloadLength; + } + + // nocommit -- lazy init this? ie, if every single term + // was pulsed then we never need to use this fallback? + // Fallback writer for non-pulsed terms: + final DocsConsumer postingsDocsWriter; + + /** If docFreq <= maxPulsingDocFreq, its postings are + * inlined into terms dict */ + + FormatPulsingDocsWriter(SegmentWriteState state, int maxPulsingDocFreq, DocsConsumer postingsDocsWriter) throws IOException { + super(); + + pendingDocs = new Document[maxPulsingDocFreq]; + for(int i=0;i 0) { + if (pos.payload == null || payloadLength > pos.payload.length) + pos.payload = new byte[ArrayUtil.getNextSize(payloadLength)]; + System.arraycopy(payload, payloadOffset, pos.payload, 0, payloadLength); + pos.payloadLength = payloadLength; + } else + pos.payloadLength = 0; + } + void finishDoc() { + assert currentDoc.numPositions == currentDoc.termDocFreq; + } + void finishTerm(boolean isIndexTerm) {} + void close() {} + } + + final PositionsWriter posWriter = new PositionsWriter(); + + PositionsConsumer addDoc(int docID, int termDocFreq) throws IOException { + + assert docID >= 0: "got docID=" + docID; + + if (PostingsCodec.DEBUG) + System.out.println("PW.addDoc: docID=" + docID + " pendingDocCount=" + pendingDocCount + " vs " + pendingDocs.length + " pulsed=" + pulsed); + + if (!pulsed && pendingDocCount == pendingDocs.length) { + + // OK we just crossed the threshold, this term should + // now be "pulsed" into the main postings codec: + postingsDocsWriter.startTerm(); + if (PostingsCodec.DEBUG) + System.out.println(" now flush buffer"); + for(int i=0;i= target */ + abstract boolean skip(int target) throws IOException; + + abstract boolean next() throws IOException; + + abstract int doc(); + + abstract int freq(); + + abstract int ord(); + + abstract int read(int[] docs, int[] freqs) throws IOException; + + // nocommit -- maybe move this up to TermsEnum? 
that + // would disallow changing positions format/reader of each + // doc, though + /** Don't call next() or skipTo() or read() until you're + * done consuming the positions */ + abstract PositionsEnum positions() throws IOException; + + abstract void close() throws IOException; +} Index: src/java/org/apache/lucene/index/PulsingCodec.java =================================================================== --- src/java/org/apache/lucene/index/PulsingCodec.java (revision 0) +++ src/java/org/apache/lucene/index/PulsingCodec.java (revision 0) @@ -0,0 +1,61 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Collection; +import java.io.IOException; + +import org.apache.lucene.store.Directory; + +class PulsingCodec extends PostingsCodec { + + FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + DocsConsumer docsWriter = new FormatPostingsDocsWriter(state); + boolean success = false; + try { + DocsConsumer pulsingWriter = new FormatPulsingDocsWriter(state, 1, docsWriter); + FieldsConsumer ret = new FormatPostingsTermsDictWriter(state, pulsingWriter); + success = true; + return ret; + } finally { + if (!success) + docsWriter.close(); + } + } + + FieldsProducer fieldsProducer(Directory dir, FieldInfos fieldInfos, SegmentInfo si, int readBufferSize) throws IOException { + FormatPostingsTermsDictDocsReader docs = new FormatPostingsDocsReader(dir, si, readBufferSize); + boolean success = false; + try { + FormatPostingsTermsDictDocsReader docsReader = new FormatPulsingDocsReader(dir, si, readBufferSize, docs); + FieldsProducer ret = new FormatPostingsTermsDictReader(dir, fieldInfos, si.name, + docsReader, + readBufferSize); + success = true; + return ret; + } finally { + if (!success) + docs.close(); + } + } + + void files(SegmentInfo segmentInfo, Collection files) { + FormatPulsingDocsReader.files(segmentInfo, files); + FormatPostingsTermsDictReader.files(segmentInfo, files); + } +} Property changes on: src/java/org/apache/lucene/index/PulsingCodec.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/FieldsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/FieldsConsumer.java (revision 0) +++ src/java/org/apache/lucene/index/FieldsConsumer.java (revision 0) @@ -0,0 +1,36 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +/** Abstract API that consumes terms, doc, freq, prox and + * payloads postings. Concrete implementations of this + * actually do "something" with the postings (write it into + * the index in a specific format). + * + * NOTE: this API is experimental and will likely change + */ +abstract class FieldsConsumer { + + /** Add a new field */ + abstract TermsConsumer addField(FieldInfo field) throws IOException; + + /** Called when we are done adding everything. */ + abstract void close() throws IOException; +} Property changes on: src/java/org/apache/lucene/index/FieldsConsumer.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/FormatPostingsFieldsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsFieldsConsumer.java (revision 718730) +++ src/java/org/apache/lucene/index/FormatPostingsFieldsConsumer.java (working copy) @@ -1,36 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -/** Abstract API that consumes terms, doc, freq, prox and - * payloads postings. Concrete implementations of this - * actually do "something" with the postings (write it into - * the index in a specific format). - * - * NOTE: this API is experimental and will likely change - */ -abstract class FormatPostingsFieldsConsumer { - - /** Add a new field */ - abstract FormatPostingsTermsConsumer addField(FieldInfo field) throws IOException; - - /** Called when we are done adding everything. */ - abstract void finish() throws IOException; -} Index: src/java/org/apache/lucene/index/FormatSepDocsReader.java =================================================================== --- src/java/org/apache/lucene/index/FormatSepDocsReader.java (revision 0) +++ src/java/org/apache/lucene/index/FormatSepDocsReader.java (revision 0) @@ -0,0 +1,446 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Collection; + +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BitVector; + +/** Concrete class that reads the current doc/freq/skip + * postings format */ + +// nocommit -- should we switch "hasProx" higher up? and +// create two separate docs readers, one that also reads +// prox and one that doesn't? + +class FormatSepDocsReader extends FormatPostingsTermsDictDocsReader { + + final IndexInput freqIn; + final IndexInput docIn; + final IndexInput skipIn; + + IndexInput termsIn; + + private final FormatSepPositionsReader posReader; + + int skipInterval; + int maxSkipLevels; + + FormatSepDocsReader(Directory dir, SegmentInfo segmentInfo, int readBufferSize) throws IOException { + + boolean success = false; + try { + freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, IndexFileNames.FREQ_EXTENSION), readBufferSize); + docIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, "doc"), readBufferSize); + skipIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, "skp"), readBufferSize); + if (segmentInfo.getHasProx()) + posReader = new FormatSepPositionsReader(dir, segmentInfo, readBufferSize); + else + posReader = null; + success = true; + } finally { + if (!success) + close(); + } + } + + static void files(SegmentInfo segmentInfo, Collection files) { + files.add(IndexFileNames.segmentFileName(segmentInfo.name, IndexFileNames.FREQ_EXTENSION)); + files.add(IndexFileNames.segmentFileName(segmentInfo.name, "doc")); + files.add(IndexFileNames.segmentFileName(segmentInfo.name, "skp")); + FormatSepPositionsReader.files(segmentInfo, files); + } + + void start(IndexInput termsIn) throws IOException { + this.termsIn = termsIn; + + // Make sure we are talking to the matching postings writer + PostingsCodec.checkHeader(termsIn, FormatSepDocsWriter.CODEC, FormatSepDocsWriter.VERSION_START); + + skipInterval = termsIn.readInt(); + maxSkipLevels = termsIn.readInt(); + if (posReader != null) + posReader.start(termsIn); + } + + Reader reader(FieldInfo fieldInfo, IndexInput termsIn) { + + final FormatSepPositionsReader.TermsDictReader posReader2; + if (posReader != null && !fieldInfo.omitTf) + posReader2 = (FormatSepPositionsReader.TermsDictReader) posReader.reader(fieldInfo, termsIn); + else + posReader2 = null; + + return new TermsDictReader(fieldInfo, posReader2, termsIn); + } + + void close() throws IOException { + try { + if (freqIn != null) + freqIn.close(); + } finally { + try { + if (docIn != null) + docIn.close(); + } finally { + try { + if (skipIn != null) + skipIn.close(); + } finally { + if (posReader != null) + posReader.close(); + } + } + } + } + + class TermsDictReader extends Reader { + + final IndexInput termsIn; + final FieldInfo fieldInfo; +
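// File pointers for the term most recently read from the + // terms dict: written as absolute values at index terms, + // and as deltas from the previous term otherwise (see + // readTerm below): +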
long freqOffset; + long docOffset; + long skipOffset; + int docFreq; + + // TODO: abstraction violation (we are storing this with + // the concrete impl. as the type, not the abstract base + // class) + final FormatSepPositionsReader.TermsDictReader posReader; + private SegmentDocsEnum docs; + + TermsDictReader(FieldInfo fieldInfo, FormatSepPositionsReader.TermsDictReader posReader, IndexInput termsIn) { + this.termsIn = termsIn; // not cloned + this.fieldInfo = fieldInfo; + this.posReader = posReader; + } + + void readTerm(int docFreq, boolean isIndexTerm) throws IOException { + + this.docFreq = docFreq; + if (PostingsCodec.DEBUG) { + System.out.println(" dr.readTerm termsFP=" + termsIn.getFilePointer() + " df=" + docFreq + " isIndex=" + isIndexTerm); + System.out.println(" start freqFP=" + freqOffset + " docFP=" + docOffset + " skipFP=" + skipOffset); + } + + if (isIndexTerm) { + freqOffset = termsIn.readVLong(); + docOffset = termsIn.readVLong(); + skipOffset = termsIn.readVLong(); + } else { + freqOffset += termsIn.readVLong(); + docOffset += termsIn.readVLong(); + if (docFreq >= skipInterval) + skipOffset += termsIn.readVLong(); + } + + if (PostingsCodec.DEBUG) + System.out.println(" freqFP=" + freqOffset + " docFP=" + docOffset + " skipFP=" + skipOffset); + + if (posReader != null) + posReader.readTerm(docFreq, isIndexTerm); + } + + public void close() throws IOException { + if (posReader != null) + posReader.close(); + } + + DocsEnum docs(BitVector deletedDocs) throws IOException { + + if (docs == null) + // Lazy init + docs = new SegmentDocsEnum(); + + docs.init(deletedDocs); + + return docs; + } + + class SegmentDocsEnum extends DocsEnum { + int docFreq; + int doc; + int count; + int freq; + long freqStart; + final IndexInput freqIn; + final IndexInput docIn; + + // nocommit -- should we do omitTF with 2 different enum classes? + final boolean omitTF; + private BitVector deletedDocs; + + // nocommit -- should we do hasProx with 2 different enum classes? + + boolean skipped; + SepSkipListReader skipper; + + // TODO: abstraction violation: we are storing the + // concrete impl, not the abstract base class + FormatSepPositionsReader.TermsDictReader.SegmentPositionsEnum positions; + + SegmentDocsEnum() { + if (PostingsCodec.DEBUG) + System.out.println("new docs enum"); + + this.docIn = (IndexInput) FormatSepDocsReader.this.docIn.clone(); + omitTF = fieldInfo.omitTf; + if (!omitTF) + this.freqIn = (IndexInput) FormatSepDocsReader.this.freqIn.clone(); + else { + this.freqIn = null; + freq = 1; + } + } + + void close() {} + + void init(BitVector deletedDocs) throws IOException { + if (PostingsCodec.DEBUG) + System.out.println("[" + desc + "] dr.init freqIn seek " + freqOffset + " this=" + this + " (in=" + freqIn + "; this=" + this + ")"); + this.deletedDocs = deletedDocs; + docIn.seek(docOffset); + if (!omitTF) + freqIn.seek(freqOffset); + this.docFreq = TermsDictReader.this.docFreq; + count = 0; + doc = 0; + skipped = false; + proxSkipFreq = 0; + + // maybe not necessary? 
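+ // (-1 marks the payload length as not yet known; the + // skipper supplies a real length before it is used)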
+ proxSkipPayloadLength = -1; + + // TODO: abstraction violation + if (posReader != null) { + posOffset = posReader.posOffset; + payloadOffset = posReader.payloadOffset; + } + } + + boolean next() throws IOException { + + if (PostingsCodec.DEBUG) { + + if (!omitTF) + System.out.println("dr [" + desc + "] next count=" + count + " vs df=" + docFreq + " freqFP=" + freqIn.getFilePointer() + " docFP=" + docIn.getFilePointer() + " deletes?=" + (deletedDocs != null) ); + else + System.out.println("dr [" + desc + "] next count=" + count + " vs df=" + docFreq + " docFP=" + docIn.getFilePointer() + " deletes?=" + (deletedDocs != null) ); + } + + // new Throwable().printStackTrace(System.out); + + while(true) { + if (count == docFreq) + return false; + + count++; + + // Decode next doc + doc += docIn.readVInt(); + + if (!omitTF) { + freq = freqIn.readVInt(); + if (positions != null) + positions.skip(freq); + else + proxSkipFreq += freq; + } + + if (deletedDocs == null || !deletedDocs.get(doc)) + break; + else if (PostingsCodec.DEBUG) + System.out.println(" doc=" + doc + " is deleted"); + } + + // nocommit + if (PostingsCodec.DEBUG) { + if (positions != null) + positions.desc = desc + ":" + doc; + System.out.println(" return doc=" + doc); + } + return true; + } + + int read(int[] docs, int[] freqs) throws IOException { + int i = 0; + final int length = docs.length; + while (i < length && count < docFreq) { + count++; + // manually inlined call to next() for speed + doc += docIn.readVInt(); + if (!omitTF) { + freq = freqIn.readVInt(); + if (positions != null) + positions.skip(freq); + else + proxSkipFreq += freq; + } + + if (deletedDocs == null || !deletedDocs.get(doc)) { + docs[i] = doc; + freqs[i] = freq; + i++; + } + } + + return i; + } + + int doc() { + return doc; + } + + int ord() { + assert count > 0; + return count-1; + } + + int freq() { + return freq; + } + + long posOffset; + long payloadOffset; + int proxSkipPayloadLength = -1; + int proxSkipFreq; + PositionsEnum fakePositions; + + PositionsEnum positions() throws IOException { + if (positions == null) { + // Lazy init + if (posReader == null) { + // TermFreq was omitted from this field during + // indexing, which means we pretend termFreq is + // always 1 with that 1 occurrence having + // position 0 + if (fakePositions == null) + fakePositions = new FakePositionsEnum(); + return fakePositions; + } else { + // TODO: abstraction violation + positions = (FormatSepPositionsReader.TermsDictReader.SegmentPositionsEnum) posReader.positions(); + if (PostingsCodec.DEBUG) + System.out.println("pos skip posOffset=" + posOffset + " payloadlen=" + proxSkipPayloadLength + " skipPosCount= " + proxSkipFreq); + positions.skip(posOffset, payloadOffset, proxSkipPayloadLength, proxSkipFreq); + } + } + + if (PostingsCodec.DEBUG) + positions.desc = desc + ":" + doc; + + positions.catchUp(freq); + + return positions; + } + + boolean skip(int target) throws IOException { + + // TODO: jump right to next() if target is < X away + // from where we are now? 
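+ // Skipping runs in two phases: first consult this + // term's skip list (if it has one) to jump near the + // target, then scan forward with next() to the first + // doc >= target.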
+ + if (PostingsCodec.DEBUG) + System.out.println("dr [" + desc + "]: skip to target=" + target); + + if (docFreq >= skipInterval) { + + // There are enough docs in the posting to have + // skip data + if (skipper == null) + // Lazy init + skipper = new SepSkipListReader((IndexInput) skipIn.clone(), maxSkipLevels, skipInterval); + + if (!skipped) { + + // We haven't already skipped for this posting, + // so now we init the skipper + + // TODO: this is abstraction violation; instead, + // skipper should interact with this as a + // private consumer + skipper.init(skipOffset, + docOffset, freqOffset, posOffset, payloadOffset, + docFreq, fieldInfo.storePayloads); + + if (PostingsCodec.DEBUG) + System.out.println(" skip reader base skipFP=" + skipOffset + " docFP=" + docOffset + " freqFP=" + freqOffset + " proxFP=" + posOffset + " payloadFP=" + payloadOffset); + + skipped = true; + } + + final int newCount = skipper.skipTo(target); + + if (newCount > count) { + + if (PostingsCodec.DEBUG) + System.out.println("dr [" + desc + "]: skipper moved to newCount=" + newCount + " docFP=" + skipper.getDocPointer() + " freqFP=" + skipper.getFreqPointer() + " posFP=" + skipper.getPosPointer() + " payloadFP=" + skipper.getPayloadPointer() + " doc=" + skipper.getDoc()); + + // Skipper did move + if (!omitTF) + freqIn.seek(skipper.getFreqPointer()); + docIn.seek(skipper.getDocPointer()); + count = newCount; + doc = skipper.getDoc(); + + // TODO: abstraction violation; this should be a + // private interaction b/w skipper & posReader + if (positions != null) + // nocommit -- should that be count? + positions.skip(skipper.getPosPointer(), skipper.getPayloadPointer(), skipper.getPayloadLength(), 0); + else { + posOffset = skipper.getPosPointer(); + payloadOffset = skipper.getPayloadPointer(); + proxSkipPayloadLength = skipper.getPayloadLength(); + // nocommit -- should that be count? 
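+ // (no positions enum is open yet, so just record where + // the skipper landed; positions() will seek there + // lazily if it is ever called)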
+ proxSkipFreq = 0; + } + } else if (PostingsCodec.DEBUG) + System.out.println(" no skipping to be done"); + } + + // Now, linear scan for the rest: + do { + if (!next()) + return false; + } while (target > doc); + + return true; + } + } + } +} + +/** Returned when someone asks for positions() enum on field + * with omitTf true */ +class FakePositionsEnum extends PositionsEnum { + int next() { + return 0; + } + int getPayloadLength() { + return 0; + } + boolean hasPayload() { + return false; + } + byte[] getPayload(byte[] data, int offset) { + return null; + } + } Property changes on: src/java/org/apache/lucene/index/FormatSepDocsReader.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/SegmentMerger.java =================================================================== --- src/java/org/apache/lucene/index/SegmentMerger.java (revision 718730) +++ src/java/org/apache/lucene/index/SegmentMerger.java (working copy) @@ -30,6 +30,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; /** * The SegmentMerger class combines two or more Segments, represented by an IndexReader ({@link #add}, @@ -76,10 +77,14 @@ SegmentMerger(Directory dir, String name) { directory = dir; segment = name; + codec = PostingsCodec.getCodec(); } - SegmentMerger(IndexWriter writer, String name, MergePolicy.OneMerge merge) { + private final PostingsCodec codec; + + SegmentMerger(IndexWriter writer, String name, MergePolicy.OneMerge merge, PostingsCodec codec) { directory = writer.getDirectory(); + this.codec = codec; segment = name; if (merge != null) checkAbort = new CheckAbort(merge, directory); @@ -164,21 +169,27 @@ CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, fileName, checkAbort); - List files = - new ArrayList(IndexFileNames.COMPOUND_EXTENSIONS.length + 1); - + List files = new ArrayList(IndexFileNames.COMPOUND_EXTENSIONS.length + 1); + // Basic files - for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS.length; i++) { - String ext = IndexFileNames.COMPOUND_EXTENSIONS[i]; + for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS_NOT_CODEC.length; i++) { + String ext = IndexFileNames.COMPOUND_EXTENSIONS_NOT_CODEC[i]; + /* if (ext.equals(IndexFileNames.PROX_EXTENSION) && !hasProx()) continue; + */ if (mergeDocStores || (!ext.equals(IndexFileNames.FIELDS_EXTENSION) && !ext.equals(IndexFileNames.FIELDS_INDEX_EXTENSION))) files.add(segment + "." 
+ ext); } + // nocommit -- not clean + SegmentInfo info = new SegmentInfo(segment, mergedDocs, directory); + info.setHasProx(fieldInfos.hasProx()); + codec.files(info, files); + // Fieldable norm files for (int i = 0; i < fieldInfos.size(); i++) { FieldInfo fi = fieldInfos.fieldInfo(i); @@ -481,24 +492,24 @@ private final void mergeTerms() throws CorruptIndexException, IOException { - SegmentWriteState state = new SegmentWriteState(null, directory, segment, null, mergedDocs, 0, termIndexInterval); + SegmentWriteState state = new SegmentWriteState(null, directory, segment, fieldInfos, null, mergedDocs, 0, termIndexInterval, codec); + + final FieldsConsumer consumer = codec.fieldsConsumer(state); - final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos); - try { queue = new SegmentMergeQueue(readers.size()); mergeTermInfos(consumer); } finally { - consumer.finish(); + consumer.close(); if (queue != null) queue.close(); } } boolean omitTF; - private final void mergeTermInfos(final FormatPostingsFieldsConsumer consumer) throws CorruptIndexException, IOException { + private final void mergeTermInfos(final FieldsConsumer consumer) throws CorruptIndexException, IOException { int base = 0; final int readerCount = readers.size(); for (int i = 0; i < readerCount; i++) { @@ -525,7 +536,7 @@ SegmentMergeInfo[] match = new SegmentMergeInfo[readers.size()]; String currentField = null; - FormatPostingsTermsConsumer termsConsumer = null; + TermsConsumer termsConsumer = null; while (queue.size() > 0) { int matchSize = 0; // pop matching terms @@ -572,6 +583,8 @@ return delCounts; } + private char[] termBuffer; + /** Process postings from multiple segments all positioned on the * same term. Writes out merged entries into freqOutput and * the proxOutput streams. @@ -582,10 +595,17 @@ * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ - private final int appendPostings(final FormatPostingsTermsConsumer termsConsumer, SegmentMergeInfo[] smis, int n) + private final int appendPostings(final TermsConsumer termsConsumer, SegmentMergeInfo[] smis, int n) throws CorruptIndexException, IOException { - final FormatPostingsDocsConsumer docConsumer = termsConsumer.addTerm(smis[0].term.text); + final String text = smis[0].term.text; + final int len = text.length(); + if (termBuffer == null || termBuffer.length < 1+len) + termBuffer = new char[ArrayUtil.getNextSize(1+len)]; + text.getChars(0, len, termBuffer, 0); + termBuffer[len] = 0xffff; + + final DocsConsumer docConsumer = termsConsumer.startTerm(termBuffer, 0); int df = 0; for (int i = 0; i < n; i++) { SegmentMergeInfo smi = smis[i]; @@ -598,13 +618,18 @@ while (postings.next()) { df++; int doc = postings.doc(); - if (docMap != null) + if (docMap != null) { doc = docMap[doc]; // map around deletions + assert doc != -1: "postings enum returned deleted docID " + postings.doc() + " freq=" + postings.freq() + " df=" + df; + } doc += base; // convert to merged space final int freq = postings.freq(); - final FormatPostingsPositionsConsumer posConsumer = docConsumer.addDoc(doc, freq); + final PositionsConsumer posConsumer = docConsumer.addDoc(doc, freq); + // nocommit -- omitTF should be "private", and this + // code (and FreqProxTermsWriter) should instead + // check if posConsumer is null? 
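+ // eg, roughly (a sketch only, not part of this patch): + // final PositionsConsumer pc = docConsumer.addDoc(doc, freq); + // if (pc != null) { + // for(int j=0;j<freq;j++) + // pc.addPosition(postings.nextPosition(), payloadBuffer, 0, payloadLength); + // pc.finishDoc(); + // } + // so a codec that omits tf/positions would simply + // return null from addDoc.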
if (!omitTF) { for (int j = 0; j < freq; j++) { final int position = postings.nextPosition(); @@ -616,12 +641,13 @@ } posConsumer.addPosition(position, payloadBuffer, 0, payloadLength); } - posConsumer.finish(); + posConsumer.finishDoc(); } } } - docConsumer.finish(); + termsConsumer.finishTerm(termBuffer, 0, df); + return df; } Index: src/java/org/apache/lucene/index/FormatSepDocsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FormatSepDocsWriter.java (revision 0) +++ src/java/org/apache/lucene/index/FormatSepDocsWriter.java (revision 0) @@ -0,0 +1,231 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.store.IndexOutput; + +/** Writes frq to .frq, docs to .doc, pos to .pos, payloads + * to .pyl, and skip data to .skp */ + +final class FormatSepDocsWriter extends DocsConsumer { + final static String CODEC = "SepDocFreqSkip"; + + // Increment version to change it: + final static int VERSION_START = 0; + final static int VERSION_CURRENT = VERSION_START; + + final IndexOutput freqOut; + final IndexOutput docOut; + final IndexOutput skipOut; + IndexOutput termsOut; + + final FormatSepPositionsWriter posWriter; + final SepSkipListWriter skipListWriter; + final int skipInterval; + final int maxSkipLevels; + final int totalNumDocs; + + boolean omitTF; + boolean storePayloads; + + // Starts a new term + long lastFreqStart; + long freqStart; + long lastDocStart; + long docStart; + long lastSkipStart; + + FieldInfo fieldInfo; + + FormatSepDocsWriter(SegmentWriteState state) throws IOException { + super(); + + final String frqFileName = IndexFileNames.segmentFileName(state.segmentName, "frq"); + state.flushedFiles.add(frqFileName); + freqOut = state.directory.createOutput(frqFileName); + + final String docFileName = IndexFileNames.segmentFileName(state.segmentName, "doc"); + state.flushedFiles.add(docFileName); + docOut = state.directory.createOutput(docFileName); + + final String skipFileName = IndexFileNames.segmentFileName(state.segmentName, "skp"); + state.flushedFiles.add(skipFileName); + skipOut = state.directory.createOutput(skipFileName); + + if (PostingsCodec.DEBUG) + System.out.println("dw.init: create frq=" + frqFileName + " doc=" + docFileName + " skip=" + skipFileName); + + totalNumDocs = state.numDocs; + + // nocommit -- abstraction violation + skipListWriter = new SepSkipListWriter(state.skipInterval, + state.maxSkipLevels, + state.numDocs, + freqOut, docOut, + null, null); + + skipInterval = state.skipInterval; + maxSkipLevels = state.maxSkipLevels; + + posWriter = new FormatSepPositionsWriter(state, this); + } + +
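// Layout written by this codec, one file per postings + // stream: segment.frq (freqs), segment.doc (doc deltas) + // and segment.skp (skip data); positions & payloads go + // to separate files owned by FormatSepPositionsWriter. +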
void start(IndexOutput termsOut) throws IOException { + this.termsOut = termsOut; + PostingsCodec.writeHeader(termsOut, CODEC, VERSION_CURRENT); + // nocommit -- just ask skipper to "start" here + termsOut.writeInt(skipInterval); // write skipInterval + termsOut.writeInt(maxSkipLevels); // write maxSkipLevels + posWriter.start(termsOut); + } + + void startTerm() { + freqStart = freqOut.getFilePointer(); + docStart = docOut.getFilePointer(); + if (!omitTF) + posWriter.startTerm(); + skipListWriter.resetSkip(); + } + + // nocommit -- should we NOT reuse across fields? would + // be cleaner + + // Currently, this instance is re-used across fields, so + // our parent calls setField whenever the field changes + void setField(FieldInfo fieldInfo) { + this.fieldInfo = fieldInfo; + omitTF = fieldInfo.omitTf; + storePayloads = fieldInfo.storePayloads; + posWriter.setField(fieldInfo); + } + + int lastDocID; + int df; + + int count; + + /** Adds a new doc in this term. If this returns null + * then we just skip consuming positions/payloads. */ + PositionsConsumer addDoc(int docID, int termDocFreq) throws IOException { + + final int delta = docID - lastDocID; + + if (PostingsCodec.DEBUG) + System.out.println(" dw.addDoc [" + desc + "] count=" + (count++) + " docID=" + docID + " lastDocID=" + lastDocID + " delta=" + delta + " omitTF=" + omitTF + " freq=" + termDocFreq + " freqPointer=" + freqOut.getFilePointer()); + + if (docID < 0 || (df > 0 && delta <= 0)) + throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )"); + + if ((++df % skipInterval) == 0) { + // TODO: abstraction violation + // nocommit -- awkward we have to make these two + // separate calls to skipper + skipListWriter.setSkipData(lastDocID, storePayloads, posWriter.lastPayloadLength); + skipListWriter.bufferSkip(df); + if (PostingsCodec.DEBUG) + System.out.println(" bufferSkip lastDocID=" + lastDocID + " df=" + df + " freqFP=" + freqOut.getFilePointer() + " docFP=" + docOut.getFilePointer() + " posFP=" + skipListWriter.posOutput.getFilePointer() + " payloadFP=" + skipListWriter.payloadOutput.getFilePointer() + " payloadLen=" + posWriter.lastPayloadLength); + } + + // nocommit -- move this assert up above; every consumer + // shouldn't have to check for this bug: + assert docID < totalNumDocs: "docID=" + docID + " totalNumDocs=" + totalNumDocs; + + lastDocID = docID; + docOut.writeVInt(delta); + if (!omitTF) + freqOut.writeVInt(termDocFreq); + + // nocommit + if (PostingsCodec.DEBUG) + ((FormatSepPositionsWriter) posWriter).desc = desc + ":" + docID; + + if (omitTF) + return null; + else + return posWriter; + } + + /** Called when we are done adding docs to this term */ + void finishTerm(int docCount, boolean isIndexTerm) throws IOException { + + long skipPos = skipOut.getFilePointer(); + + // nocommit -- wasteful we are counting this in two places? 
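+ // (df is incremented per doc in addDoc above, while + // docCount comes from the terms dict writer; the assert + // below cross-checks the two)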
+ assert docCount == df; + if (PostingsCodec.DEBUG) + System.out.println("dw.finishTerm termsFP=" + termsOut.getFilePointer() + " freqStart=" + freqStart + " df=" + df + " skipPos=" + skipPos); + + if (isIndexTerm) { + // Write absolute at seek points + termsOut.writeVLong(freqStart); + termsOut.writeVLong(docStart); + } else { + // Write delta between seek points + termsOut.writeVLong(freqStart - lastFreqStart); + termsOut.writeVLong(docStart - lastDocStart); + } + + if (df >= skipInterval) { + if (PostingsCodec.DEBUG) + System.out.println(" writeSkip @ docFp=" + docOut.getFilePointer() + " freqFP=" + freqOut.getFilePointer() + " freqStartFP=" + freqStart + " skipPos=" + skipPos + " lastSkipPos=" + lastSkipStart); + + skipListWriter.writeSkip(skipOut); + } + + if (isIndexTerm) { + termsOut.writeVLong(skipPos); + lastSkipStart = skipPos; + } else if (df >= skipInterval) { + termsOut.writeVLong(skipPos-lastSkipStart); + lastSkipStart = skipPos; + } + + lastFreqStart = freqStart; + lastDocStart = docStart; + + if (!omitTF) + posWriter.finishTerm(isIndexTerm); + + lastDocID = 0; + df = 0; + + // nocommit + count = 0; + } + + void close() throws IOException { + if (PostingsCodec.DEBUG) + System.out.println("dw.close freqFP=" + freqOut.getFilePointer() + " docFP=" + docOut.getFilePointer() + " skipFP=" + skipOut.getFilePointer()); + try { + freqOut.close(); + } finally { + try { + docOut.close(); + } finally { + try { + skipOut.close(); + } finally { + posWriter.close(); + } + } + } + } +} Property changes on: src/java/org/apache/lucene/index/FormatSepDocsWriter.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/FreqProxTermsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FreqProxTermsWriter.java (revision 718730) +++ src/java/org/apache/lucene/index/FreqProxTermsWriter.java (working copy) @@ -88,21 +88,23 @@ } } + final int numAllFields = allFields.size(); + // Sort by field name Collections.sort(allFields); - final int numAllFields = allFields.size(); - // TODO: allow Lucene user to customize this consumer: - final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos); + // TODO: allow Lucene user to customize this codec: + final FieldsConsumer consumer = state.codec.fieldsConsumer(state); + /* Current writer chain: - FormatPostingsFieldsConsumer - -> IMPL: FormatPostingsFieldsWriter - -> FormatPostingsTermsConsumer - -> IMPL: FormatPostingsTermsWriter - -> FormatPostingsDocConsumer - -> IMPL: FormatPostingsDocWriter - -> FormatPostingsPositionsConsumer + FieldsConsumer + -> IMPL: FormatPostingsTermsDictWriter + -> TermsConsumer + -> IMPL: FormatPostingsTermsDictWriter.TermsWriter + -> DocsConsumer + -> IMPL: FormatPostingsDocsWriter + -> PositionsConsumer -> IMPL: FormatPostingsPositionsWriter */ @@ -145,8 +147,7 @@ FreqProxTermsWriterPerThread perThread = (FreqProxTermsWriterPerThread) entry.getKey(); perThread.termsHashPerThread.reset(true); } - - consumer.finish(); + consumer.close(); } private byte[] payloadBuffer; @@ -155,7 +156,7 @@ * instances) found in this field and serialize them * into a single RAM segment. 
*/ void appendPostings(FreqProxTermsWriterPerField[] fields, - FormatPostingsFieldsConsumer consumer) + FieldsConsumer consumer) throws CorruptIndexException, IOException { int numFields = fields.length; @@ -172,7 +173,7 @@ assert result; } - final FormatPostingsTermsConsumer termsConsumer = consumer.addField(fields[0].fieldInfo); + final TermsConsumer termsConsumer = consumer.addField(fields[0].fieldInfo); FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields]; @@ -196,11 +197,15 @@ termStates[numToMerge++] = mergeStates[i]; } - final FormatPostingsDocsConsumer docConsumer = termsConsumer.addTerm(termStates[0].text, termStates[0].textOffset); + final char[] termText = termStates[0].text; + final int termTextOffset = termStates[0].textOffset; + final DocsConsumer docConsumer = termsConsumer.startTerm(termText, termTextOffset); + // Now termStates has numToMerge FieldMergeStates // which all share the same term. Now we must // interleave the docID streams. + int numDocs = 0; while(numToMerge > 0) { FreqProxFieldMergeState minState = termStates[0]; @@ -209,8 +214,9 @@ minState = termStates[i]; final int termDocFreq = minState.termFreq; + numDocs++; - final FormatPostingsPositionsConsumer posConsumer = docConsumer.addDoc(minState.docID, termDocFreq); + final PositionsConsumer posConsumer = docConsumer.addDoc(minState.docID, termDocFreq); final ByteSliceReader prox = minState.prox; @@ -241,7 +247,7 @@ posConsumer.addPosition(position, payloadBuffer, 0, payloadLength); } //End for - posConsumer.finish(); + posConsumer.finishDoc(); } if (!minState.nextDoc()) { @@ -269,7 +275,7 @@ } } - docConsumer.finish(); + termsConsumer.finishTerm(termText, termTextOffset, numDocs); } termsConsumer.finish(); Index: src/java/org/apache/lucene/index/TermsEnum.java =================================================================== --- src/java/org/apache/lucene/index/TermsEnum.java (revision 0) +++ src/java/org/apache/lucene/index/TermsEnum.java (revision 0) @@ -0,0 +1,53 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.util.BitVector; + +/** + * NOTE: this API is experimental and will likely change + */ + +abstract class TermsEnum { + + // nocommit -- char[] or byte[] version? + /** Seeks to the specified term. Returns true if the term + * exists. */ + abstract boolean seek(String text) throws IOException; + + /** Increments the enumeration to the next element. True if one exists.*/ + abstract boolean next() throws IOException; + + // nocommit -- char[] or byte[] version? 
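+ // (a byte[] flavor might look like, for instance: + // abstract int term(byte[] result, int offset); + // returning the UTF-8 length -- just a sketch, not + // part of this patch)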
+ /** Returns the text for current Term in the enumeration.*/ + abstract String text(); + + /** Returns the docFreq of the current Term in the enumeration.*/ + abstract int docFreq(); + + /** Get DocsEnum for the current term. You should not + * call {@link #next()} or {@link #seek()} until you're + * done using the DocsEnum. */ + abstract DocsEnum docs(BitVector deletedDocs) throws IOException; + + /** Closes the enumeration to further activity, freeing resources. */ + abstract void close() throws IOException; +} + Index: src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java (revision 718730) +++ src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java (working copy) @@ -1,71 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -final class FormatPostingsTermsWriter extends FormatPostingsTermsConsumer { - - final FormatPostingsFieldsWriter parent; - final FormatPostingsDocsWriter docsWriter; - final TermInfosWriter termsOut; - FieldInfo fieldInfo; - - FormatPostingsTermsWriter(SegmentWriteState state, FormatPostingsFieldsWriter parent) throws IOException { - super(); - this.parent = parent; - termsOut = parent.termsOut; - docsWriter = new FormatPostingsDocsWriter(state, this); - } - - void setField(FieldInfo fieldInfo) { - this.fieldInfo = fieldInfo; - docsWriter.setField(fieldInfo); - } - - char[] currentTerm; - int currentTermStart; - - long freqStart; - long proxStart; - - /** Adds a new term in this field */ - FormatPostingsDocsConsumer addTerm(char[] text, int start) { - currentTerm = text; - currentTermStart = start; - - // TODO: this is abstraction violation -- ideally this - // terms writer is not so "invasive", looking for file - // pointers in its child consumers. - freqStart = docsWriter.out.getFilePointer(); - if (docsWriter.posWriter.out != null) - proxStart = docsWriter.posWriter.out.getFilePointer(); - - parent.skipListWriter.resetSkip(); - - return docsWriter; - } - - /** Called when we are done adding terms to this field */ - void finish() { - } - - void close() throws IOException { - docsWriter.close(); - } -} Index: src/java/org/apache/lucene/index/FieldsProducer.java =================================================================== --- src/java/org/apache/lucene/index/FieldsProducer.java (revision 0) +++ src/java/org/apache/lucene/index/FieldsProducer.java (revision 0) @@ -0,0 +1,34 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +/** Abstract API that provides terms, doc, freq, prox and + * payloads postings. Concrete implementations of this + * actually do "something" to read the postings from some + * store. + * + * NOTE: this API is experimental and will likely change + */ + +abstract class FieldsProducer { + // TODO: field iteration API? + abstract TermsProducer getField(FieldInfo fieldInfo) throws IOException; + abstract void close() throws IOException; +} Index: src/java/org/apache/lucene/index/TermInfo.java =================================================================== --- src/java/org/apache/lucene/index/TermInfo.java (revision 718730) +++ src/java/org/apache/lucene/index/TermInfo.java (working copy) @@ -17,7 +17,10 @@ * limitations under the License. */ -/** A TermInfo is the record of information stored for a term.*/ +/** A TermInfo is the record of information stored for a + * term + * @deprecated This class is no longer used in flexible + * indexing. */ final class TermInfo { /** The number of documents which contain the term. */ Index: src/java/org/apache/lucene/index/FormatPostingsTermsDictPositionsReader.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsTermsDictPositionsReader.java (revision 0) +++ src/java/org/apache/lucene/index/FormatPostingsTermsDictPositionsReader.java (revision 0) @@ -0,0 +1,42 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.store.IndexInput; + +// nocommit -- bad name: this class never interacts directly +// w/ termsdict +abstract class FormatPostingsTermsDictPositionsReader { + + abstract class Reader { + abstract void readTerm(int docFreq, boolean isIndexTerm) throws IOException; + + /** Returns a pos enum for the last term read */ + abstract PositionsEnum positions() throws IOException; + + abstract void close() throws IOException; + } + + abstract void start(IndexInput termsIn) throws IOException; + + abstract Reader reader(FieldInfo fieldInfo, IndexInput termsIn) throws IOException; + + abstract void close() throws IOException; +} Property changes on: src/java/org/apache/lucene/index/FormatPostingsTermsDictPositionsReader.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/TermInfosReader.java =================================================================== --- src/java/org/apache/lucene/index/TermInfosReader.java (revision 718730) +++ src/java/org/apache/lucene/index/TermInfosReader.java (working copy) @@ -27,8 +27,9 @@ /** This stores a monotonically increasing set of <Term, TermInfo> pairs in a * Directory. Pairs are accessed either by Term or by ordinal position the - * set. */ - + * set + * @deprecated This class has been replaced by + * FormatPostingsTermsDictReader, except for reading old segments. */ final class TermInfosReader { private Directory directory; private String segment; Index: src/java/org/apache/lucene/index/DocumentsWriter.java =================================================================== --- src/java/org/apache/lucene/index/DocumentsWriter.java (revision 718730) +++ src/java/org/apache/lucene/index/DocumentsWriter.java (working copy) @@ -531,7 +531,9 @@ synchronized private void initFlushState(boolean onlyDocStore) { initSegmentName(onlyDocStore); - flushState = new SegmentWriteState(this, directory, segment, docStoreSegment, numDocsInRAM, numDocsInStore, writer.getTermIndexInterval()); + flushState = new SegmentWriteState(this, directory, segment, docFieldProcessor.fieldInfos, + docStoreSegment, numDocsInRAM, numDocsInStore, writer.getTermIndexInterval(), + writer.codec); } /** Flush all pending docs to a new segment */ @@ -599,8 +601,12 @@ CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, segment + "."
+ IndexFileNames.COMPOUND_FILE_EXTENSION); Iterator it = flushState.flushedFiles.iterator(); - while(it.hasNext()) - cfsWriter.addFile((String) it.next()); + while(it.hasNext()) { + final String fileName = (String) it.next(); + if (PostingsCodec.DEBUG) + System.out.println("make cfs " + fileName); + cfsWriter.addFile(fileName); + } // Perform the merge cfsWriter.close(); @@ -974,14 +980,17 @@ // Delete by query IndexSearcher searcher = new IndexSearcher(reader); iter = deletesFlushed.queries.entrySet().iterator(); + //System.out.println("DW: flush delete by query"); while(iter.hasNext()) { Entry entry = (Entry) iter.next(); Query query = (Query) entry.getKey(); + //System.out.println("\n del query=" + query.toString()); int limit = ((Integer) entry.getValue()).intValue(); Weight weight = query.weight(searcher); Scorer scorer = weight.scorer(reader); while(scorer.next()) { final int docID = scorer.doc(); + //System.out.println(" del docID=" + docID); if (docIDStart + docID >= limit) break; reader.deleteDocument(docID); Index: src/java/org/apache/lucene/index/TermInfosWriter.java =================================================================== --- src/java/org/apache/lucene/index/TermInfosWriter.java (revision 718730) +++ src/java/org/apache/lucene/index/TermInfosWriter.java (working copy) @@ -24,8 +24,10 @@ import org.apache.lucene.util.UnicodeUtil; /** This stores a monotonically increasing set of <Term, TermInfo> pairs in a - Directory. A TermInfos can be written once, in order. */ - + Directory. A TermInfos can be written once, in order. + * + * @deprecated This class has been replaced by + * FormatPostingsTermsDictWriter. */ final class TermInfosWriter { /** The file format version, a negative number. */ public static final int FORMAT = -3; @@ -36,193 +38,4 @@ // NOTE: always change this if you switch to a new format! public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES; - - private FieldInfos fieldInfos; - private IndexOutput output; - private TermInfo lastTi = new TermInfo(); - private long size; - - // TODO: the default values for these two parameters should be settable from - // IndexWriter. However, once that's done, folks will start setting them to - // ridiculous values and complaining that things don't work well, as with - // mergeFactor. So, let's wait until a number of folks find that alternate - // values work better. Note that both of these values are stored in the - // segment, so that it's safe to change these w/o rebuilding all indexes. - - /** Expert: The fraction of terms in the "dictionary" which should be stored - * in RAM. Smaller values use more memory, but make searching slightly - * faster, while larger values use less memory and make searching slightly - * slower. Searching is typically not dominated by dictionary lookup, so - * tweaking this is rarely useful.*/ - int indexInterval = 128; - - /** Expert: The fraction of {@link TermDocs} entries stored in skip tables, - * used to accellerate {@link TermDocs#skipTo(int)}. Larger values result in - * smaller indexes, greater acceleration, but fewer accelerable cases, while - * smaller values result in bigger indexes, less acceleration and more - * accelerable cases. More detailed experiments would be useful here. */ - int skipInterval = 16; - - /** Expert: The maximum number of skip levels. Smaller values result in - * slightly smaller indexes, but slower skipping in big posting lists.
- */ - int maxSkipLevels = 10; - - private long lastIndexPointer; - private boolean isIndex; - private byte[] lastTermBytes = new byte[10]; - private int lastTermBytesLength = 0; - private int lastFieldNumber = -1; - - private TermInfosWriter other; - private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result(); - - TermInfosWriter(Directory directory, String segment, FieldInfos fis, - int interval) - throws IOException { - initialize(directory, segment, fis, interval, false); - other = new TermInfosWriter(directory, segment, fis, interval, true); - other.other = this; - } - - private TermInfosWriter(Directory directory, String segment, FieldInfos fis, - int interval, boolean isIndex) throws IOException { - initialize(directory, segment, fis, interval, isIndex); - } - - private void initialize(Directory directory, String segment, FieldInfos fis, - int interval, boolean isi) throws IOException { - indexInterval = interval; - fieldInfos = fis; - isIndex = isi; - output = directory.createOutput(segment + (isIndex ? ".tii" : ".tis")); - output.writeInt(FORMAT_CURRENT); // write format - output.writeLong(0); // leave space for size - output.writeInt(indexInterval); // write indexInterval - output.writeInt(skipInterval); // write skipInterval - output.writeInt(maxSkipLevels); // write maxSkipLevels - assert initUTF16Results(); - } - - void add(Term term, TermInfo ti) throws IOException { - UnicodeUtil.UTF16toUTF8(term.text, 0, term.text.length(), utf8Result); - add(fieldInfos.fieldNumber(term.field), utf8Result.result, utf8Result.length, ti); - } - - // Currently used only by assert statements - UnicodeUtil.UTF16Result utf16Result1; - UnicodeUtil.UTF16Result utf16Result2; - - // Currently used only by assert statements - private boolean initUTF16Results() { - utf16Result1 = new UnicodeUtil.UTF16Result(); - utf16Result2 = new UnicodeUtil.UTF16Result(); - return true; - } - - // Currently used only by assert statement - private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) { - - if (lastFieldNumber != fieldNumber) { - final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber)); - // If there is a field named "" (empty string) then we - // will get 0 on this comparison, yet, it's "OK". But - // it's not OK if two different field numbers map to - // the same name. - if (cmp != 0 || lastFieldNumber != -1) - return cmp; - } - - UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1); - UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2); - final int len; - if (utf16Result1.length < utf16Result2.length) - len = utf16Result1.length; - else - len = utf16Result2.length; - - for(int i=0;i<len;i++) { - final char ch1 = utf16Result1.result[i]; - final char ch2 = utf16Result2.result[i]; - if (ch1 != ch2) - return ch1-ch2; - } - return utf16Result1.length - utf16Result2.length; - } - - /** Adds a new <<fieldNumber, termBytes>, TermInfo> pair to the set. - Term must be lexicographically greater than all previous Terms added.
- TermInfo pointers must be positive and greater than all previous.*/ - void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti) - throws IOException { - - assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 || - (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) : - "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" + - " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" + - " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8"); - - assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")"; - assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")"; - - if (!isIndex && size % indexInterval == 0) - other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term - - writeTerm(fieldNumber, termBytes, termBytesLength); // write term - - output.writeVInt(ti.docFreq); // write doc freq - output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers - output.writeVLong(ti.proxPointer - lastTi.proxPointer); - - if (ti.docFreq >= skipInterval) { - output.writeVInt(ti.skipOffset); - } - - if (isIndex) { - output.writeVLong(other.output.getFilePointer() - lastIndexPointer); - lastIndexPointer = other.output.getFilePointer(); // write pointer - } - - lastFieldNumber = fieldNumber; - lastTi.set(ti); - size++; - } - - private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength) - throws IOException { - - // TODO: UTF16toUTF8 could tell us this prefix - // Compute prefix in common with last term: - int start = 0; - final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength; - while(start < limit) { - if (termBytes[start] != lastTermBytes[start]) - break; - start++; - } - - final int length = termBytesLength - start; - output.writeVInt(start); // write shared prefix length - output.writeVInt(length); // write delta length - output.writeBytes(termBytes, start, length); // write delta bytes - output.writeVInt(fieldNumber); // write field num - if (lastTermBytes.length < termBytesLength) { - byte[] newArray = new byte[(int) (termBytesLength*1.5)]; - System.arraycopy(lastTermBytes, 0, newArray, 0, start); - lastTermBytes = newArray; - } - System.arraycopy(termBytes, start, lastTermBytes, start, length); - lastTermBytesLength = termBytesLength; - } - - /** Called to complete TermInfos creation. */ - void close() throws IOException { - output.seek(4); // write size after format - output.writeLong(size); - output.close(); - - if (!isIndex) - other.close(); - } - } Index: src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java (revision 718730) +++ src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java (working copy) @@ -1,32 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -import org.apache.lucene.store.IndexInput; - -abstract class FormatPostingsPositionsConsumer { - - /** Add a new position & payload. If payloadLength > 0 - * you must read those bytes from the IndexInput. */ - abstract void addPosition(int position, byte[] payload, int payloadOffset, int payloadLength) throws IOException; - - /** Called when we are done adding positions & payloads */ - abstract void finish() throws IOException; -} Index: src/java/org/apache/lucene/index/PositionsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/PositionsConsumer.java (revision 0) +++ src/java/org/apache/lucene/index/PositionsConsumer.java (revision 0) @@ -0,0 +1,43 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.store.IndexOutput; + +abstract class PositionsConsumer { + + abstract void start(IndexOutput termsOut) throws IOException; + + abstract void startTerm() throws IOException; + + /** Add a new position & payload. If payloadLength > 0, + * the payload bytes are passed in the payload array. NOTE: + * you must fully consume the byte[] payload, since + * caller is free to reuse it on subsequent calls. */ + abstract void addPosition(int position, byte[] payload, int payloadOffset, int payloadLength) throws IOException; + + /** Called when we are done adding positions & payloads + * for each doc */ + abstract void finishDoc() throws IOException; + + abstract void finishTerm(boolean isIndexTerm) throws IOException; + + abstract void close() throws IOException; +} Property changes on: src/java/org/apache/lucene/index/PositionsConsumer.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/IndexWriter.java =================================================================== --- src/java/org/apache/lucene/index/IndexWriter.java (revision 718730) +++ src/java/org/apache/lucene/index/IndexWriter.java (working copy) @@ -307,6 +307,8 @@ // LUCENE-888 for details.
private final static int MERGE_READ_BUFFER_SIZE = 4096; + PostingsCodec codec = PostingsCodec.getCodec(); + // Used for printing messages private static Object MESSAGE_ID_LOCK = new Object(); private static int MESSAGE_ID = 0; @@ -1205,7 +1207,7 @@ // KeepOnlyLastCommitDeleter: deleter = new IndexFileDeleter(directory, deletionPolicy == null ? new KeepOnlyLastCommitDeletionPolicy() : deletionPolicy, - segmentInfos, infoStream, docWriter); + segmentInfos, infoStream, docWriter, codec); if (deleter.startingCommitDeleted) // Deletion policy deleted the "head" commit point. @@ -3389,9 +3391,11 @@ // hits an exception it will release the write lock: startTransaction(true); + success = false; + try { mergedName = newSegmentName(); - merger = new SegmentMerger(this, mergedName, null); + merger = new SegmentMerger(this, mergedName, null, codec); IndexReader sReader = null; synchronized(this) { @@ -4471,7 +4475,7 @@ if (infoStream != null) message("merging " + merge.segString(directory)); - merger = new SegmentMerger(this, mergedName, merge); + merger = new SegmentMerger(this, mergedName, merge, codec); boolean success = false; Index: src/java/org/apache/lucene/index/PostingsCodec.java =================================================================== --- src/java/org/apache/lucene/index/PostingsCodec.java (revision 0) +++ src/java/org/apache/lucene/index/PostingsCodec.java (revision 0) @@ -0,0 +1,71 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
Index: src/java/org/apache/lucene/index/PostingsCodec.java
===================================================================
--- src/java/org/apache/lucene/index/PostingsCodec.java (revision 0)
+++ src/java/org/apache/lucene/index/PostingsCodec.java (revision 0)
@@ -0,0 +1,71 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Collection;
+import java.io.IOException;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+
+abstract class PostingsCodec {
+
+  static boolean DEBUG = false;
+
+  static final int CODEC_HEADER = 0x1af65;
+
+  /** Writes a new segment */
+  abstract FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException;
+
+  /** Reads a segment */
+  abstract FieldsProducer fieldsProducer(Directory dir, FieldInfos fieldInfos, SegmentInfo si, int readBufferSize) throws IOException;
+
+  /** Gathers files associated with this segment */
+  abstract void files(SegmentInfo segmentInfo, Collection files);
+
+  /** Get default codec */
+  static PostingsCodec getCodec() {
+    // nocommit: put current codec here
+    //return new PulsingCodec();
+    return new SepCodec();
+    //return new DefaultCodec();
+  }
+
+  static void checkHeader(IndexInput in, String codec, int version) throws IOException {
+
+    // Safety to guard against reading a bogus string:
+    int header = in.readVInt();
+    if (header != CODEC_HEADER)
+      throw new CorruptIndexException("codec header mismatch");
+
+    final String actualCodec = in.readString();
+    if (!codec.equals(actualCodec))
+      throw new CorruptIndexException("codec mismatch: expected '" + codec + "' but got '" + actualCodec + "'");
+
+    int actualVersion = in.readVInt();
+    if (actualVersion > version)
+      throw new CorruptIndexException("version '" + actualVersion + "' is too new (expected <= '" + version + "')");
+  }
+
+  static void writeHeader(IndexOutput out, String codec, int version) throws IOException {
+    out.writeVInt(CODEC_HEADER);
+    out.writeString(codec);
+    out.writeVInt(version);
+  }
+}
\ No newline at end of file

Property changes on: src/java/org/apache/lucene/index/PostingsCodec.java
___________________________________________________________________
Name: svn:eol-style
   + native
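Illustration (not part of the patch): writeHeader and checkHeader are mirror
images — a VInt magic, the codec name string, then a VInt version — with
checkHeader rejecting an unexpected name and any version newer than the reader
supports. A small round-trip sketch, using a RAMDirectory and an invented file
name for illustration:

    package org.apache.lucene.index;

    import java.io.IOException;

    import org.apache.lucene.store.IndexInput;
    import org.apache.lucene.store.IndexOutput;
    import org.apache.lucene.store.RAMDirectory;

    // Illustration only: write a codec header, then verify it.
    class HeaderRoundTrip {
      public static void main(String[] args) throws IOException {
        RAMDirectory dir = new RAMDirectory();

        IndexOutput out = dir.createOutput("codec.hdr");
        PostingsCodec.writeHeader(out, "TestCodec", 1);
        out.close();

        IndexInput in = dir.openInput("codec.hdr");
        // Throws CorruptIndexException if the magic or codec name
        // does not match, or if the stored version is > 1:
        PostingsCodec.checkHeader(in, "TestCodec", 1);
        in.close();

        dir.close();
      }
    }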
Index: src/java/org/apache/lucene/index/IndexFileDeleter.java
===================================================================
--- src/java/org/apache/lucene/index/IndexFileDeleter.java (revision 718730)
+++ src/java/org/apache/lucene/index/IndexFileDeleter.java (working copy)
@@ -104,6 +104,7 @@
   private Directory directory;
   private IndexDeletionPolicy policy;
   private DocumentsWriter docWriter;
+  private PostingsCodec codec;
 
   final boolean startingCommitDeleted;
 
@@ -129,7 +130,8 @@
    * @throws CorruptIndexException if the index is corrupt
    * @throws IOException if there is a low-level IO error
    */
-  public IndexFileDeleter(Directory directory, IndexDeletionPolicy policy, SegmentInfos segmentInfos, PrintStream infoStream, DocumentsWriter docWriter)
+  public IndexFileDeleter(Directory directory, IndexDeletionPolicy policy, SegmentInfos segmentInfos, PrintStream infoStream, DocumentsWriter docWriter,
+                          PostingsCodec codec)
     throws CorruptIndexException, IOException {
 
     this.docWriter = docWriter;
@@ -140,6 +142,7 @@
 
     this.policy = policy;
     this.directory = directory;
+    this.codec = codec;
 
     // First pass: walk the files and initialize our ref
     // counts:
@@ -187,7 +190,7 @@
             sis = null;
           }
           if (sis != null) {
-            CommitPoint commitPoint = new CommitPoint(commitsToDelete, directory, sis);
+            CommitPoint commitPoint = new CommitPoint(commitsToDelete, directory, sis, codec);
             if (sis.getGeneration() == segmentInfos.getGeneration()) {
               currentCommitPoint = commitPoint;
             }
@@ -215,7 +218,7 @@
         }
         if (infoStream != null)
           message("forced open of current segments file " + segmentInfos.getCurrentSegmentFileName());
-        currentCommitPoint = new CommitPoint(commitsToDelete, directory, sis);
+        currentCommitPoint = new CommitPoint(commitsToDelete, directory, sis, codec);
         commits.add(currentCommitPoint);
         incRef(sis, true);
       }
@@ -399,7 +402,7 @@
 
     if (isCommit) {
       // Append to our commits list:
-      commits.add(new CommitPoint(commitsToDelete, directory, segmentInfos));
+      commits.add(new CommitPoint(commitsToDelete, directory, segmentInfos, codec));
 
       // Tell policy so it can remove commits:
       policy.onCommit(commits);
@@ -432,7 +435,7 @@
       for(int i=0;i