Index: src/test/org/apache/lucene/store/MockRAMInputStream.java =================================================================== --- src/test/org/apache/lucene/store/MockRAMInputStream.java (revision 747337) +++ src/test/org/apache/lucene/store/MockRAMInputStream.java (working copy) @@ -1,7 +1,5 @@ package org.apache.lucene.store; -import java.io.IOException; - /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -19,6 +17,8 @@ * limitations under the License. */ +import java.io.IOException; + /** * Used by MockRAMDirectory to create an input stream that * keeps track of when it's been closed. @@ -44,16 +44,8 @@ // all clones get closed: if (!isClone) { synchronized(dir.openFiles) { - Integer v = (Integer) dir.openFiles.get(name); - // Could be null when MockRAMDirectory.crash() was called - if (v != null) { - if (v.intValue() == 1) { - dir.openFiles.remove(name); - } else { - v = new Integer(v.intValue()-1); - dir.openFiles.put(name, v); - } - } + assert dir.openFiles.containsKey(this): "input=" + name + " is not open"; + dir.openFiles.remove(this); } } } Index: src/test/org/apache/lucene/store/MockRAMDirectory.java =================================================================== --- src/test/org/apache/lucene/store/MockRAMDirectory.java (revision 747337) +++ src/test/org/apache/lucene/store/MockRAMDirectory.java (working copy) @@ -208,9 +208,18 @@ if (crashed) throw new IOException("cannot createOutput after crash"); init(); - synchronized(openFiles) { + synchronized(this) { if (preventDoubleWrite && createdFiles.contains(name) && !name.equals("segments.gen")) throw new IOException("file \"" + name + "\" was already written to"); + } + synchronized(openFiles) { - if (noDeleteOpenFile && openFiles.containsKey(name)) - throw new IOException("MockRAMDirectory: file \"" + name + "\" is still open: cannot overwrite"); + // openFiles is keyed by IndexInput, so check the OpenFile values for this name: + if (noDeleteOpenFile) { + Iterator it = openFiles.values().iterator(); + while(it.hasNext()) { + OpenFile openFile = (OpenFile) it.next(); + if (name.equals(openFile.name)) + throw new IOException("MockRAMDirectory: file \"" + name + "\" is still open: cannot overwrite"); + } + } } @@ -237,6 +239,15 @@ return new MockRAMOutputStream(this, file); } + static class OpenFile { + final String name; + final Throwable stack; + OpenFile(String name) { + this.name = name; + this.stack = new Throwable(); + } + } + public IndexInput openInput(String name) throws IOException { RAMFile file; synchronized (this) { @@ -245,17 +256,12 @@ if (file == null) throw new FileNotFoundException(name); else { + IndexInput in = new MockRAMInputStream(this, name, file); synchronized(openFiles) { - if (openFiles.containsKey(name)) { - Integer v = (Integer) openFiles.get(name); - v = new Integer(v.intValue()+1); - openFiles.put(name, v); - } else { - openFiles.put(name, new Integer(1)); - } + openFiles.put(in, new OpenFile(name)); } + return in; } - return new MockRAMInputStream(this, name, file); } /** Provided for testing purposes. Use sizeInBytes() instead.
*/ @@ -289,7 +295,14 @@ if (noDeleteOpenFile && openFiles.size() > 0) { // RuntimeException instead of IOException because // super() does not throw IOException currently: - throw new RuntimeException("MockRAMDirectory: cannot close: there are still open files: " + openFiles); + Iterator it = openFiles.values().iterator(); + System.out.println("\nMockRAMDirectory open files:"); + while(it.hasNext()) { + OpenFile openFile = (OpenFile) it.next(); + System.out.println("\nfile " + openFile.name + " opened from:\n"); + openFile.stack.printStackTrace(System.out); + } + throw new RuntimeException("MockRAMDirectory: cannot close: there are still open files"); } } } Index: src/test/org/apache/lucene/search/TestSort.java =================================================================== --- src/test/org/apache/lucene/search/TestSort.java (revision 747337) +++ src/test/org/apache/lucene/search/TestSort.java (working copy) @@ -870,7 +870,7 @@ //ScoreDoc[] result = searcher.search (query, null, 1000, sort).scoreDocs; TopDocs hits = searcher.search (query, null, expectedResult.length(), sort); ScoreDoc[] result = hits.scoreDocs; - assertEquals(hits.totalHits, expectedResult.length()); + assertEquals(expectedResult.length(), hits.totalHits); StringBuffer buff = new StringBuffer(10); int n = result.length; for (int i=0; i 1 level skipping +// - test all combinations of payloads/not and omitTF/not +// - test w/ different indexDivisor +// - test field where payload length rarely changes +// - 0-term fields +// - seek/skip to same term/doc i'm already on +// - mix in deleted docs +// - seek, skip beyond end -- assert returns false +// - seek, skip to things that don't exist -- ensure it +// goes to 1 before next one known to exist +// - skipTo(term) +// - skipTo(doc) + +public class TestFormatPostings extends LuceneTestCase { + + private static final Random RANDOM = new Random(42); + private static String[] fieldNames = new String[] {"one", "two", "three", "four"}; + + private final static int NUM_TEST_ITER = 4000; + private final static int NUM_TEST_THREADS = 3; + private final static int NUM_FIELDS = 4; + private final static int NUM_TERMS_RAND = 50; // must be > 16 to test skipping + private final static int DOC_FREQ_RAND = 500; // must be > 16 to test skipping + private final static int TERM_DOC_FREQ_RAND = 20; + + // start is inclusive and end is exclusive + public int nextInt(int start, int end) { + return start + RANDOM.nextInt(end-start); + } + + private int nextInt(int lim) { + return RANDOM.nextInt(lim); + } + + private boolean nextBoolean() { + return 0 == nextInt(2); + } + + char[] getRandomText() { + + final int len = 1+nextInt(10); + char[] buffer = new char[len+1]; + for(int i=0;i=0;i--) { + if (PostingsCodec.DEBUG) + System.out.println(" TEST: term=" + field.terms[i].text2 + " has docFreq=" + field.terms[i].docs.length); + assertTrue(termsEnum.seek(field.terms[i].text2)); + assertEquals(field.terms[i].docs.length, termsEnum.docFreq()); + } + + // Seek to non-existent empty-string term + assertFalse(termsEnum.seek("")); + + // Make sure we're now pointing to first term + assertEquals(field.terms[0].text2, termsEnum.text()); + + // Test docs enum + if (PostingsCodec.DEBUG) + System.out.println("\nTEST: docs/positions"); + termsEnum.seek(""); + upto = 0; + do { + if (nextInt(3) == 1) { + term = field.terms[upto]; + if (PostingsCodec.DEBUG) + System.out.println("TEST [" + getDesc(field, term) + "]: iterate docs..."); + DocsEnum docs = termsEnum.docs(); + int upto2 = -1; + while(upto2 <
term.docs.length-1) { + // Maybe skip: + final int left = term.docs.length-upto2; + int doc; + if (nextInt(3) == 1 && left >= 1) { + int inc = 1+nextInt(left-1); + upto2 += inc; + if (PostingsCodec.DEBUG) + System.out.println("TEST [" + getDesc(field, term) + "]: skip: " + left + " docs left; skip to doc=" + term.docs[upto2] + " [" + upto2 + " of " + term.docs.length + "]"); + doc = docs.skipTo(term.docs[upto2]); + // nocommit -- test skipping to non-existent doc + assertEquals(term.docs[upto2], doc); + } else { + doc = docs.next(); + assertTrue(doc != -1); + if (PostingsCodec.DEBUG) + System.out.println("TEST [" + getDesc(field, term) + "]: got next doc..."); + upto2++; + } + assertEquals(term.docs[upto2], doc); + if (!field.omitTF) { + assertEquals(term.positions[upto2].length, docs.freq()); + if (nextInt(2) == 1) { + if (PostingsCodec.DEBUG) + System.out.println("TEST [" + getDesc(field, term, term.docs[upto2]) + "]: check positions for doc " + term.docs[upto2] + "..."); + verifyPositions(term.positions[upto2], docs.positions()); + } else if (PostingsCodec.DEBUG) + System.out.println("TEST: skip positions..."); + } else if (PostingsCodec.DEBUG) + System.out.println("TEST: skip positions: omitTF=true"); + } + + assertEquals(-1, docs.next()); + + } else if (PostingsCodec.DEBUG) + System.out.println("TEST [" + getDesc(field, term) + "]: skip docs"); + upto++; + + } while (termsEnum.next()); + + assertEquals(upto, field.terms.length); + + //termsEnum.close(); + } + } + } + + private void write(FieldInfos fieldInfos, Directory dir, FieldData[] fields) throws Throwable { + + // nocommit -- randomize this: + final int termIndexInterval = 16; + + SegmentWriteState state = new SegmentWriteState(null, dir, SEGMENT, fieldInfos, null, 10000, 10000, termIndexInterval, + PostingsCodecs.getDefault()); + + final FieldsConsumer consumer = state.codec.fieldsConsumer(state); + Arrays.sort(fields); + for(int i=0;i= 0); } Index: src/test/org/apache/lucene/index/TestBackwardsCompatibility.java =================================================================== --- src/test/org/apache/lucene/index/TestBackwardsCompatibility.java (revision 747337) +++ src/test/org/apache/lucene/index/TestBackwardsCompatibility.java (working copy) @@ -58,11 +58,11 @@ // oldNames array. /* - public void testCreatePreLocklessCFS() throws IOException { + public void xxxtestCreatePreLocklessCFS() throws IOException { createIndex("index.cfs", true); } - public void testCreatePreLocklessNoCFS() throws IOException { + public void xxxtestCreatePreLocklessNoCFS() throws IOException { createIndex("index.nocfs", false); } */ @@ -103,13 +103,13 @@ zipFile.close(); } - public void testCreateCFS() throws IOException { + public void xxxtestCreateCFS() throws IOException { String dirName = "testindex.cfs"; createIndex(dirName, true); rmDir(dirName); } - public void testCreateNoCFS() throws IOException { + public void xxxtestCreateNoCFS() throws IOException { String dirName = "testindex.nocfs"; createIndex(dirName, true); rmDir(dirName); @@ -127,7 +127,7 @@ "23.nocfs", }; - public void testOptimizeOldIndex() throws IOException { + public void xxxtestOptimizeOldIndex() throws IOException { for(int i=0;iScorer for documents matching a Term. */ final class TermScorer extends Scorer { private Weight weight; - private TermDocs termDocs; + private DocsEnum termDocs; private byte[] norms; private float weightValue; private int doc; @@ -44,7 +44,7 @@ * @param similarity The Similarity implementation to be used for score computations. 
* @param norms The field norms of the document fields for the Term. */ - TermScorer(Weight weight, TermDocs td, Similarity similarity, + TermScorer(Weight weight, DocsEnum td, Similarity similarity, byte[] norms) { super(similarity); this.weight = weight; @@ -80,7 +80,7 @@ if (pointerMax != 0) { pointer = 0; } else { - termDocs.close(); // close stream + //termDocs.close(); // close stream doc = Integer.MAX_VALUE; // set to sentinel value return false; } @@ -107,7 +107,7 @@ if (pointerMax != 0) { pointer = 0; } else { - termDocs.close(); // close stream + //termDocs.close(); // close stream doc = Integer.MAX_VALUE; // set to sentinel value return false; } @@ -142,16 +142,17 @@ } // not found in cache, seek underlying stream - boolean result = termDocs.skipTo(target); - if (result) { + int newDoc = termDocs.skipTo(target); + if (newDoc != -1) { pointerMax = 1; pointer = 0; - docs[pointer] = doc = termDocs.doc(); + docs[pointer] = doc = newDoc; freqs[pointer] = termDocs.freq(); + return true; } else { doc = Integer.MAX_VALUE; + return false; } - return result; } /** Returns an explanation of the score for a document. @@ -169,15 +170,12 @@ pointer++; } if (tf == 0) { - if (termDocs.skipTo(doc)) - { - if (termDocs.doc() == doc) - { - tf = termDocs.freq(); - } - } + int newDoc = termDocs.skipTo(doc); + if (newDoc == doc) { + tf = termDocs.freq(); + } } - termDocs.close(); + //termDocs.close(); tfExplanation.setValue(getSimilarity().tf(tf)); tfExplanation.setDescription("tf(termFreq("+query.getTerm()+")="+tf+")"); Index: src/java/org/apache/lucene/index/FormatPostingsDocsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsDocsConsumer.java (revision 747337) +++ src/java/org/apache/lucene/index/FormatPostingsDocsConsumer.java (working copy) @@ -1,34 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -/** - * NOTE: this API is experimental and will likely change - */ - -abstract class FormatPostingsDocsConsumer { - - /** Adds a new doc in this term. If this returns null - * then we just skip consuming positions/payloads. 
*/ - abstract FormatPostingsPositionsConsumer addDoc(int docID, int termDocFreq) throws IOException; - - /** Called when we are done adding docs to this term */ - abstract void finish() throws IOException; -} Index: src/java/org/apache/lucene/index/DocsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/DocsConsumer.java (revision 0) +++ src/java/org/apache/lucene/index/DocsConsumer.java (revision 0) @@ -0,0 +1,48 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.store.IndexOutput; + +/** + * NOTE: this API is experimental and will likely change + */ + +abstract class DocsConsumer { + + // nocommit + String desc; + + abstract void start(IndexOutput termsOut) throws IOException; + + abstract void startTerm() throws IOException; + + /** Adds a new doc in this term. Return null if this + * consumer doesn't need to see the positions for this + * doc. */ + abstract PositionsConsumer addDoc(int docID, int termDocFreq) throws IOException; + + /** Finishes the current term */ + abstract void finishTerm(int numDocs, boolean isIndexTerm) throws IOException; + + abstract void setField(FieldInfo fieldInfo); + + abstract void close() throws IOException; +} Property changes on: src/java/org/apache/lucene/index/DocsConsumer.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/index/SepSkipListWriter.java =================================================================== --- src/java/org/apache/lucene/index/SepSkipListWriter.java (revision 0) +++ src/java/org/apache/lucene/index/SepSkipListWriter.java (revision 0) @@ -0,0 +1,173 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.store.IndexOutput; + + +/** + * Implements the skip list writer for the 'sep' postings format, + * which stores docs, freqs, positions and payloads in separate files. + * + */ +class SepSkipListWriter extends MultiLevelSkipListWriter { + private int[] lastSkipDoc; + private int[] lastSkipPayloadLength; + private long[] lastSkipDocPointer; + private long[] lastSkipFreqPointer; + private long[] lastSkipPosPointer; + private long[] lastSkipPayloadPointer; + + private IndexOutput freqOutput; + private IndexOutput docOutput; + // nocommit -- private again + IndexOutput posOutput; + // nocommit -- private again + IndexOutput payloadOutput; + + private int curDoc; + private boolean curStorePayloads; + private int curPayloadLength; + private long curFreqPointer; + private long curDocPointer; + private long curPosPointer; + private long curPayloadPointer; + + SepSkipListWriter(int skipInterval, int numberOfSkipLevels, int docCount, + IndexOutput freqOutput, + IndexOutput docOutput, + IndexOutput posOutput, + IndexOutput payloadOutput) { + super(skipInterval, numberOfSkipLevels, docCount); + + this.freqOutput = freqOutput; + this.docOutput = docOutput; + this.posOutput = posOutput; + this.payloadOutput = payloadOutput; + + lastSkipDoc = new int[numberOfSkipLevels]; + lastSkipPayloadLength = new int[numberOfSkipLevels]; + lastSkipFreqPointer = new long[numberOfSkipLevels]; + lastSkipDocPointer = new long[numberOfSkipLevels]; + lastSkipPosPointer = new long[numberOfSkipLevels]; + lastSkipPayloadPointer = new long[numberOfSkipLevels]; + } + + void setFreqOutput(IndexOutput freqOutput) { + this.freqOutput = freqOutput; + } + + void setDocOutput(IndexOutput docOutput) { + this.docOutput = docOutput; + } + + void setPosOutput(IndexOutput posOutput) { + this.posOutput = posOutput; + } + + void setPayloadOutput(IndexOutput payloadOutput) { + this.payloadOutput = payloadOutput; + } + + /** + * Sets the values for the current skip data. + */ + void setSkipData(int doc, boolean storePayloads, int payloadLength) { + this.curDoc = doc; + this.curStorePayloads = storePayloads; + this.curPayloadLength = payloadLength; + this.curFreqPointer = freqOutput.getFilePointer(); + this.curDocPointer = docOutput.getFilePointer(); + if (posOutput != null) + this.curPosPointer = posOutput.getFilePointer(); + if (payloadOutput != null) + this.curPayloadPointer = payloadOutput.getFilePointer(); + } + + protected void resetSkip() { + super.resetSkip(); + Arrays.fill(lastSkipDoc, 0); + Arrays.fill(lastSkipPayloadLength, -1); // we don't have to write the first length in the skip list + Arrays.fill(lastSkipFreqPointer, freqOutput.getFilePointer()); + Arrays.fill(lastSkipDocPointer, docOutput.getFilePointer()); + if (posOutput != null) + Arrays.fill(lastSkipPosPointer, posOutput.getFilePointer()); + if (payloadOutput != null) + Arrays.fill(lastSkipPayloadPointer, payloadOutput.getFilePointer()); + + if (PostingsCodec.DEBUG && posOutput != null) + System.out.println(" skip writer base freqFP=" + freqOutput.getFilePointer() + " posFP=" + posOutput.getFilePointer()); + } + + protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException { + // To efficiently store payloads in the posting lists we do not store the length of + // every payload. Instead we omit the length for a payload if the previous payload had + // the same length. + // However, in order to support skipping, the payload length at every skip point must be known.
+ // So we use the same length encoding that we use for the posting lists for the skip data as well: + // Case 1: current field does not store payloads + // SkipDatum --> DocSkip, FreqSkip, ProxSkip + // DocSkip,FreqSkip,ProxSkip --> VInt + // DocSkip records the document number before every SkipInterval th document in TermFreqs. + // Document numbers are represented as differences from the previous value in the sequence. + // Case 2: current field stores payloads + // SkipDatum --> DocSkip, PayloadLength?, FreqSkip,ProxSkip + // DocSkip,FreqSkip,ProxSkip --> VInt + // PayloadLength --> VInt + // In this case DocSkip/2 is the difference between + // the current and the previous value. If DocSkip + // is odd, then a PayloadLength encoded as VInt follows, + // if DocSkip is even, then it is assumed that the + // current payload length equals the length at the previous + // skip point + //System.out.println(" skip writer level=" + level + " curDoc=" + curDoc + " lastDoc=" + lastSkipDoc[level] + " delta=" + (curDoc - lastSkipDoc[level]) + " storePayloads=" + curStorePayloads + " skipBufferFP=" + skipBuffer.getFilePointer()); + if (curStorePayloads) { + int delta = curDoc - lastSkipDoc[level]; + if (curPayloadLength == lastSkipPayloadLength[level]) { + // the current payload length equals the length at the previous skip point, + // so we don't store the length again + skipBuffer.writeVInt(delta << 1); + } else { + // the payload length is different from the previous one. We shift the DocSkip, + // set the lowest bit and store the current payload length as VInt. + skipBuffer.writeVInt(delta << 1 | 1); + skipBuffer.writeVInt(curPayloadLength); + lastSkipPayloadLength[level] = curPayloadLength; + } + } else { + // current field does not store payloads + skipBuffer.writeVInt(curDoc - lastSkipDoc[level]); + } + + // nocommit -- if payloads / pos not stored for this + // field, don't encode these 0's + skipBuffer.writeVInt((int) (curFreqPointer - lastSkipFreqPointer[level])); + skipBuffer.writeVInt((int) (curDocPointer - lastSkipDocPointer[level])); + skipBuffer.writeVInt((int) (curPosPointer - lastSkipPosPointer[level])); + skipBuffer.writeVInt((int) (curPayloadPointer - lastSkipPayloadPointer[level])); + + lastSkipDoc[level] = curDoc; + lastSkipFreqPointer[level] = curFreqPointer; + lastSkipDocPointer[level] = curDocPointer; + lastSkipPosPointer[level] = curPosPointer; + lastSkipPayloadPointer[level] = curPayloadPointer; + } +} Property changes on: src/java/org/apache/lucene/index/SepSkipListWriter.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/index/FormatPostingsDocsReader.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsDocsReader.java (revision 0) +++ src/java/org/apache/lucene/index/FormatPostingsDocsReader.java (revision 0) @@ -0,0 +1,427 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Collection; + +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BitVector; + +/** Concrete class that reads the current doc/freq/skip + * postings format */ + +// nocommit -- should we switch "hasProx" higher up? and +// create two separate docs readers, one that also reads +// prox and one that doesn't? + +class FormatPostingsDocsReader extends FormatPostingsTermsDictDocsReader { + + final IndexInput freqIn; + IndexInput termsIn; + + private final FormatPostingsPositionsReader posReader; + + int skipInterval; + int maxSkipLevels; + + FormatPostingsDocsReader(Directory dir, SegmentInfo segmentInfo, int readBufferSize) throws IOException { + freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, IndexFileNames.FREQ_EXTENSION), readBufferSize); + + boolean success = false; + try { + if (segmentInfo.getHasProx()) + posReader = new FormatPostingsPositionsReader(dir, segmentInfo, readBufferSize); + else + posReader = null; + success = true; + } finally { + if (!success) + freqIn.close(); + } + } + + static void files(SegmentInfo segmentInfo, Collection files) { + files.add(IndexFileNames.segmentFileName(segmentInfo.name, IndexFileNames.FREQ_EXTENSION)); + FormatPostingsPositionsReader.files(segmentInfo, files); + } + + void start(IndexInput termsIn) throws IOException { + this.termsIn = termsIn; + + // Make sure we are talking to the matching past writer + PostingsCodec.checkHeader(termsIn, FormatPostingsDocsWriter.CODEC, FormatPostingsDocsWriter.VERSION_START); + + skipInterval = termsIn.readInt(); + maxSkipLevels = termsIn.readInt(); + if (posReader != null) + posReader.start(termsIn); + } + + Reader reader(FieldInfo fieldInfo, IndexInput termsIn) { + + final FormatPostingsPositionsReader.TermsDictReader posReader2; + if (posReader != null && !fieldInfo.omitTf) + posReader2 = (FormatPostingsPositionsReader.TermsDictReader) posReader.reader(fieldInfo, termsIn); + else + posReader2 = null; + + return new TermsDictReader(fieldInfo, posReader2, termsIn); + } + + void close() throws IOException { + try { + freqIn.close(); + } finally { + if (posReader != null) + posReader.close(); + } + } + + class TermsDictReader extends Reader { + + final IndexInput termsIn; + final FieldInfo fieldInfo; + long freqOffset; + long skipOffset; + int docFreq; + + // TODO: abstraction violation (we are storing this with + // the concrete impl. 
as the type, not the abstract base + // class) + final FormatPostingsPositionsReader.TermsDictReader posReader; + private SegmentDocsEnum docs; + + TermsDictReader(FieldInfo fieldInfo, FormatPostingsPositionsReader.TermsDictReader posReader, IndexInput termsIn) { + this.termsIn = termsIn; // not cloned + this.fieldInfo = fieldInfo; + this.posReader = posReader; + } + + void readTerm(int docFreq, boolean isIndexTerm) throws IOException { + + this.docFreq = docFreq; + if (PostingsCodec.DEBUG) + System.out.println(" dr.readTerm termsInPointer=" + termsIn.getFilePointer() + " df=" + docFreq + " isIndex=" + isIndexTerm); + + if (isIndexTerm) + freqOffset = termsIn.readVLong(); + else + freqOffset += termsIn.readVLong(); + + if (PostingsCodec.DEBUG) + System.out.println(" freqOffset=" + freqOffset + " vs len=" + freqIn.length()); + + if (docFreq >= skipInterval) + skipOffset = termsIn.readVLong(); + else + skipOffset = 0; + + if (posReader != null) + posReader.readTerm(docFreq, isIndexTerm); + } + + public void close() throws IOException { + if (posReader != null) + posReader.close(); + } + + DocsEnum docs(BitVector deletedDocs) throws IOException { + + if (docs == null) + // Lazy init + docs = new SegmentDocsEnum(); + + docs.init(deletedDocs); + + return docs; + } + + class SegmentDocsEnum extends DocsEnum { + int docFreq; + int doc; + int count; + int freq; + long skipStart; + long freqStart; + final IndexInput freqIn; + // nocommit -- should we do omitTF with 2 different enum classes? + final boolean omitTF; + private BitVector deletedDocs; + + // nocommit -- should we do hasProx with 2 different enum classes? + + boolean skipped; + DefaultSkipListReader skipper; + + // TODO: abstraction violation: we are storing the + // concrete impl, not the abstract base class + FormatPostingsPositionsReader.TermsDictReader.SegmentPositionsEnum positions; + + SegmentDocsEnum() { + if (PostingsCodec.DEBUG) + System.out.println("new docs enum"); + this.freqIn = (IndexInput) FormatPostingsDocsReader.this.freqIn.clone(); + omitTF = fieldInfo.omitTf; + if (omitTF) + freq = 1; + } + + public void close() { + } + + void init(BitVector deletedDocs) throws IOException { + if (PostingsCodec.DEBUG) + System.out.println("[" + desc + "] dr.init freqIn seek " + freqOffset + " this=" + this + " (in=" + freqIn + "; this=" + this + ")"); + this.deletedDocs = deletedDocs; + freqIn.seek(freqOffset); + this.docFreq = TermsDictReader.this.docFreq; + count = 0; + doc = 0; + skipped = false; + skipStart = freqStart + skipOffset; + proxSkipFreq = 0; + + // maybe not necessary? 
+ proxSkipPayloadLength = -1; + + // nocommit: abstraction violation + if (posReader != null) { + proxOffset = posReader.proxOffset; + } + + if (positions != null) + positions.payloadLength = -1; + } + + public int next() throws IOException { + if (PostingsCodec.DEBUG) + System.out.println("dr [" + desc + "] next count=" + count + " vs df=" + docFreq + " freq pointer=" + freqIn.getFilePointer() + " (in=" + freqIn + "; this=" + this + ") + has del docs=" + (deletedDocs != null) ); + + // new Throwable().printStackTrace(System.out); + + while(true) { + if (count == docFreq) + return -1; + + count++; + + // Decode next doc/freq pair + final int code = freqIn.readVInt(); + if (PostingsCodec.DEBUG) + System.out.println(" read code=" + code); + if (omitTF) + doc += code; + else { + doc += code >>> 1; // shift off low bit + if ((code & 1) != 0) // if low bit is set + freq = 1; // freq is one + else + freq = freqIn.readVInt(); // else read freq + + if (positions != null) + positions.skip(freq); + else + proxSkipFreq += freq; + } + + if (deletedDocs == null || !deletedDocs.get(doc)) + break; + else if (PostingsCodec.DEBUG) + System.out.println(" doc=" + doc + " is deleted"); + } + + // nocommit + if (PostingsCodec.DEBUG && positions != null) + positions.desc = desc + ":" + doc; + + if (PostingsCodec.DEBUG) + System.out.println(" result doc=" + doc); + return doc; + } + + public int read(int[] docs, int[] freqs) throws IOException { + int i = 0; + final int length = docs.length; + while (i < length && count < docFreq) { + count++; + // manually inlined call to next() for speed + final int code = freqIn.readVInt(); + if (omitTF) { + doc += code; + freq = 1; + } else { + doc += code >>> 1; // shift off low bit + if ((code & 1) != 0) // if low bit is set + freq = 1; // freq is one + else + freq = freqIn.readVInt(); // else read freq + + if (positions != null) + positions.skip(freq); + else + proxSkipFreq += freq; + } + + if (deletedDocs == null || !deletedDocs.get(doc)) { + docs[i] = doc; + freqs[i] = freq; + ++i; + } + } + + return i; + } + + public int doc() { + return doc; + } + + public int ord() { + assert count > 0; + return count-1; + } + + public int freq() { + return freq; + } + + long proxOffset; + int proxSkipPayloadLength = -1; + int proxSkipFreq; + PositionsEnum fakePositions; + + public PositionsEnum positions() throws IOException { + if (positions == null) { + // Lazy init + if (posReader == null) { + // TermFreq was omitted from this field during + // indexing, which means we pretend termFreq is + // always 1 with that 1 occurrence having + // position 0 + if (fakePositions == null) + fakePositions = new FormatPostingsFakePositionsEnum(); + return fakePositions; + } else { + // TODO: abstraction violation + positions = (FormatPostingsPositionsReader.TermsDictReader.SegmentPositionsEnum) posReader.positions(); + if (PostingsCodec.DEBUG) + System.out.println("pos skip proxOffset=" + proxOffset + " payloadlen=" + proxSkipPayloadLength + " skipPosCount= " + proxSkipFreq); + positions.skip(proxOffset, proxSkipPayloadLength, proxSkipFreq); + } + } + + if (PostingsCodec.DEBUG) + positions.desc = desc + ":" + doc; + + positions.catchUp(freq); + + return positions; + } + + public int skipTo(int target) throws IOException { + + // TODO: jump right to next() if target is < X away + // from where we are now? 
+ + if (PostingsCodec.DEBUG) + System.out.println("dr [" + desc + "]: skip to target=" + target); + + if (skipOffset > 0) { + + // There are enough docs in the posting to have + // skip data + if (skipper == null) + // Lazy init + skipper = new DefaultSkipListReader((IndexInput) freqIn.clone(), maxSkipLevels, skipInterval); + + if (!skipped) { + + // We haven't already skipped for this posting, + // so now we init the skipper + + // TODO: this is abstraction violation; instead, + // skipper should interact with this as a + // private consumer + skipper.init(freqOffset+skipStart, + freqOffset, proxOffset, + docFreq, fieldInfo.storePayloads); + + if (PostingsCodec.DEBUG) + System.out.println(" skip reader base freqFP=" + (freqOffset+skipStart) + " freqFP=" + freqOffset + " proxFP=" + proxOffset); + + skipped = true; + } + + final int newCount = skipper.skipTo(target); + + if (newCount > count) { + + if (PostingsCodec.DEBUG) + System.out.println("dr [" + desc + "]: skipper moved to newCount=" + newCount + " freqFP=" + skipper.getFreqPointer() + " proxFP=" + skipper.getProxPointer() + " doc=" + skipper.getDoc()); + + // Skipper did move + freqIn.seek(skipper.getFreqPointer()); + count = newCount; + doc = skipper.getDoc(); + + // TODO: abstraction violation; this should be a + // private interaction b/w skipper & posReader + if (positions != null) + // nocommit -- should that be count? + positions.skip(skipper.getProxPointer(), skipper.getPayloadLength(), 0); + else { + proxOffset = skipper.getProxPointer(); + proxSkipPayloadLength = skipper.getPayloadLength(); + // nocommit -- should that be count? + proxSkipFreq = 0; + } + } else if (PostingsCodec.DEBUG) + System.out.println(" no skipping to be done"); + } else if (PostingsCodec.DEBUG) + System.out.println(" no skip data (#docs is too low)"); + + // Now, linear scan for the rest: + do { + if (next() == -1) + return -1; + } while (target > doc); + + return doc; + } + } + } +} + +/** Returned when someone asks for positions() enum on field + * with omitTf true */ +class FormatPostingsFakePositionsEnum extends PositionsEnum { + int next() { + return 0; + } + int getPayloadLength() { + return 0; + } + boolean hasPayload() { + return false; + } + byte[] getPayload(byte[] data, int offset) { + return null; + } +} Property changes on: src/java/org/apache/lucene/index/FormatPostingsDocsReader.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/index/DirectoryIndexReader.java =================================================================== --- src/java/org/apache/lucene/index/DirectoryIndexReader.java (revision 747337) +++ src/java/org/apache/lucene/index/DirectoryIndexReader.java (working copy) @@ -53,14 +53,18 @@ private SegmentInfos rollbackSegmentInfos; protected boolean readOnly; - + protected PostingsCodecs codecs; - void init(Directory directory, SegmentInfos segmentInfos, boolean closeDirectory, boolean readOnly) + void init(Directory directory, SegmentInfos segmentInfos, boolean closeDirectory, boolean readOnly, PostingsCodecs codecs) throws IOException { this.directory = directory; this.segmentInfos = segmentInfos; this.closeDirectory = closeDirectory; this.readOnly = readOnly; + if (codecs == null) + this.codecs = PostingsCodecs.getDefault(); + else + this.codecs = codecs; if (readOnly) { assert this instanceof ReadOnlySegmentReader || @@ -84,35 +88,48 @@ protected DirectoryIndexReader() {} DirectoryIndexReader(Directory directory, SegmentInfos 
segmentInfos, - boolean closeDirectory, boolean readOnly) throws IOException { + boolean closeDirectory, boolean readOnly, PostingsCodecs codecs) throws IOException { super(); - init(directory, segmentInfos, closeDirectory, readOnly); + init(directory, segmentInfos, closeDirectory, readOnly, codecs); } - static DirectoryIndexReader open(final Directory directory, final boolean closeDirectory, final IndexDeletionPolicy deletionPolicy) throws CorruptIndexException, IOException { - return open(directory, closeDirectory, deletionPolicy, null, false); + static DirectoryIndexReader open(final Directory directory, final boolean closeDirectory, + final IndexDeletionPolicy deletionPolicy, + PostingsCodecs postingsCodecs) + throws CorruptIndexException, IOException { + return (DirectoryIndexReader) open(directory, closeDirectory, deletionPolicy, null, false, postingsCodecs); } - static DirectoryIndexReader open(final Directory directory, final boolean closeDirectory, final IndexDeletionPolicy deletionPolicy, final IndexCommit commit, final boolean readOnly) throws CorruptIndexException, IOException { + static IndexReader open(final Directory directory, final boolean closeDirectory, + final IndexDeletionPolicy deletionPolicy, final IndexCommit commit, + final boolean readOnly, PostingsCodecs codecs) + throws CorruptIndexException, IOException { + final PostingsCodecs codecs2; + if (codecs == null) + codecs2 = PostingsCodecs.getDefault(); + else + codecs2 = codecs; + SegmentInfos.FindSegmentsFile finder = new SegmentInfos.FindSegmentsFile(directory) { protected Object doBody(String segmentFileName) throws CorruptIndexException, IOException { SegmentInfos infos = new SegmentInfos(); - infos.read(directory, segmentFileName); + infos.read(directory, segmentFileName, codecs2); DirectoryIndexReader reader; if (infos.size() == 1) { // index is optimized reader = SegmentReader.get(readOnly, infos, infos.info(0), false); } else if (readOnly) { - reader = new ReadOnlyMultiSegmentReader(directory, infos, false); + reader = new ReadOnlyMultiSegmentReader(directory, infos, false, codecs2); } else { - reader = new MultiSegmentReader(directory, infos, false, false); + reader = new MultiSegmentReader(directory, infos, false, false, codecs2); } reader.setDeletionPolicy(deletionPolicy); reader.closeDirectory = closeDirectory; + reader.codecs = codecs2; return reader; } }; @@ -176,7 +193,7 @@ DirectoryIndexReader newReader = doReopen(clonedInfos, true, openReadOnly); if (this != newReader) { - newReader.init(directory, clonedInfos, closeDirectory, openReadOnly); + newReader.init(directory, clonedInfos, closeDirectory, openReadOnly, codecs); newReader.deletionPolicy = deletionPolicy; } @@ -224,11 +241,11 @@ protected Object doBody(String segmentFileName) throws CorruptIndexException, IOException { SegmentInfos infos = new SegmentInfos(); - infos.read(directory, segmentFileName); + infos.read(directory, segmentFileName, codecs); DirectoryIndexReader newReader = doReopen(infos, false, openReadOnly); if (DirectoryIndexReader.this != newReader) { - newReader.init(directory, infos, closeDirectory, openReadOnly); + newReader.init(directory, infos, closeDirectory, openReadOnly, codecs); newReader.deletionPolicy = deletionPolicy; } @@ -310,7 +327,7 @@ */ public boolean isCurrent() throws CorruptIndexException, IOException { ensureOpen(); - return SegmentInfos.readCurrentVersion(directory) == segmentInfos.getVersion(); + return SegmentInfos.readCurrentVersion(directory, codecs) == segmentInfos.getVersion(); } /** @@ -326,7 +343,7
@@ if(closeDirectory) directory.close(); } - + /** * Commit changes resulting from delete, undeleteAll, or * setNorm operations @@ -344,7 +361,7 @@ // KeepOnlyLastCommitDeleter: IndexFileDeleter deleter = new IndexFileDeleter(directory, deletionPolicy == null ? new KeepOnlyLastCommitDeletionPolicy() : deletionPolicy, - segmentInfos, null, null); + segmentInfos, null, null, codecs); // Checkpoint the state we are about to change, in // case we have to roll back: @@ -416,14 +433,12 @@ * @throws IOException if there is a low-level IO error */ protected void acquireWriteLock() throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException { - if (readOnly) { // NOTE: we should not reach this code w/ the core // IndexReader classes; however, an external subclass // of IndexReader could reach this. ReadOnlySegmentReader.noWrite(); } - if (segmentInfos != null) { ensureOpen(); if (stale) @@ -437,7 +452,7 @@ // we have to check whether index has changed since this reader was opened. // if so, this reader is no longer valid for deletion - if (SegmentInfos.readCurrentVersion(directory) > segmentInfos.getVersion()) { + if (SegmentInfos.readCurrentVersion(directory, codecs) > segmentInfos.getVersion()) { stale = true; this.writeLock.release(); this.writeLock = null; @@ -548,13 +563,17 @@ /** @see IndexReader#listCommits */ public static Collection listCommits(Directory dir) throws IOException { + return listCommits(dir, PostingsCodecs.getDefault()); + } + static Collection listCommits(Directory dir, PostingsCodecs codecs) throws IOException { + final String[] files = dir.listAll(); Collection commits = new ArrayList(); SegmentInfos latest = new SegmentInfos(); - latest.read(dir); + latest.read(dir, codecs); final long currentGen = latest.getGeneration(); commits.add(new ReaderCommit(latest, dir)); @@ -571,7 +590,7 @@ try { // IOException allowed to throw there, in case // segments_N is corrupt - sis.read(dir, fileName); + sis.read(dir, fileName, codecs); } catch (FileNotFoundException fnfe) { // LUCENE-948: on NFS (and maybe others), if // you have writers switching back and forth Index: src/java/org/apache/lucene/index/LegacyFieldsEnum.java =================================================================== --- src/java/org/apache/lucene/index/LegacyFieldsEnum.java (revision 0) +++ src/java/org/apache/lucene/index/LegacyFieldsEnum.java (revision 0) @@ -0,0 +1,192 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import org.apache.lucene.util.BitVector; + +/** Implements new API (FieldsEnum/TermsEnum) on top of old + * API. Used only for IndexReader impls outside Lucene's + * core. 
*/ +class LegacyFieldsEnum extends FieldsEnum { + private final IndexReader r; + private TermEnum terms; + private String field; + + public LegacyFieldsEnum(IndexReader r) throws IOException { + this.r = r; + terms = r.terms(); + } + + private void doSeek(Term t) throws IOException { + terms.close(); + terms = r.terms(t); + } + + public boolean seek(String field) throws IOException { + this.field = field; + doSeek(new Term(field, "")); + return terms.term() != null && terms.term().field.equals(field); + } + + public boolean next() throws IOException { + + final Term seekTo = new Term(field, "\uFFFF"); + + doSeek(seekTo); + if (terms.term() != null) { + String newField = terms.term().field; + assert !newField.equals(field); + field = newField; + return true; + } else + return false; + } + + public TermsEnum terms() throws IOException { + return new LegacyTermsEnum(); + } + + public String field() { + return field; + } + + public void close() throws IOException { + terms.close(); + } + + private class LegacyTermsEnum extends TermsEnum { + + private TermEnum terms; + + LegacyTermsEnum() throws IOException { + this.terms = r.terms(new Term(field, "")); + } + + public boolean seek(String text) throws IOException { + terms.close(); + terms = r.terms(new Term(field, text)); + // assert terms.term().field.equals(field); + return terms.term() != null && terms.term().text.equals(text); + } + + public boolean next() throws IOException { + return terms.next(); + } + + public String text() { + return terms.term().text; + } + + public int docFreq() { + return terms.docFreq(); + } + + public long ord() { + throw new UnsupportedOperationException(); + } + + public DocsEnum docs() throws IOException { + return new LegacyDocsEnum(terms.term()); + } + + public void close() throws IOException { + terms.close(); + } + } + + private class LegacyDocsEnum extends DocsEnum { + final TermDocs td; + final Term term; + + TermPositions tp; + + LegacyDocsEnum(Term term) throws IOException { + this.term = term; + td = r.termDocs(term); + } + + public int next() throws IOException { + if (td.next()) { + return td.doc(); + } else { + return -1; + } + } + + public int skipTo(int target) throws IOException { + if (td.skipTo(target)) { + return td.doc(); + } else { + return -1; + } + } + + public int freq() { + return td.freq(); + } + + public int ord() { + throw new UnsupportedOperationException(); + } + + public int read(int[] docs, int[] freqs) throws IOException { + return td.read(docs, freqs); + } + + public void close() throws IOException { + td.close(); + } + + LegacyPositionsEnum lpe; + + public PositionsEnum positions() throws IOException { + if (tp == null) { + tp = r.termPositions(term); + lpe = new LegacyPositionsEnum(tp); + } else { + tp.seek(term); + } + return lpe; + } + } + + private class LegacyPositionsEnum extends PositionsEnum { + + final TermPositions tp; + LegacyPositionsEnum(TermPositions tp) { + this.tp = tp; + } + + public int next() throws IOException { + return tp.nextPosition(); + } + + public int getPayloadLength() { + return tp.getPayloadLength(); + } + + public byte[] getPayload(byte[] data, int offset) throws IOException { + return tp.getPayload(data, offset); + } + + public boolean hasPayload() { + return tp.isPayloadAvailable(); + } + } +} \ No newline at end of file Property changes on: src/java/org/apache/lucene/index/LegacyFieldsEnum.java ___________________________________________________________________ Added: svn:eol-style + native Index: 
src/java/org/apache/lucene/index/FormatPostingsDocsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsDocsWriter.java (revision 747337) +++ src/java/org/apache/lucene/index/FormatPostingsDocsWriter.java (working copy) @@ -25,36 +25,68 @@ import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.store.IndexOutput; -final class FormatPostingsDocsWriter extends FormatPostingsDocsConsumer { +final class FormatPostingsDocsWriter extends DocsConsumer { + final static String CODEC = "SingleFileDocFreqSkip"; + // Increment version to change it: + final static int VERSION_START = 0; + final static int VERSION_CURRENT = VERSION_START; + final IndexOutput out; - final FormatPostingsTermsWriter parent; final FormatPostingsPositionsWriter posWriter; final DefaultSkipListWriter skipListWriter; final int skipInterval; + final int maxSkipLevels; final int totalNumDocs; + IndexOutput termsOut; boolean omitTF; boolean storePayloads; + // Starts a new term + long lastFreqStart; long freqStart; FieldInfo fieldInfo; - FormatPostingsDocsWriter(SegmentWriteState state, FormatPostingsTermsWriter parent) throws IOException { + FormatPostingsDocsWriter(SegmentWriteState state) throws IOException { super(); - this.parent = parent; - final String fileName = IndexFileNames.segmentFileName(parent.parent.segment, IndexFileNames.FREQ_EXTENSION); + final String fileName = IndexFileNames.segmentFileName(state.segmentName, IndexFileNames.FREQ_EXTENSION); state.flushedFiles.add(fileName); - out = parent.parent.dir.createOutput(fileName); - totalNumDocs = parent.parent.totalNumDocs; + out = state.directory.createOutput(fileName); + totalNumDocs = state.numDocs; - // TODO: abstraction violation - skipInterval = parent.parent.termsOut.skipInterval; - skipListWriter = parent.parent.skipListWriter; - skipListWriter.setFreqOutput(out); + // nocommit -- abstraction violation + skipListWriter = new DefaultSkipListWriter(state.skipInterval, + state.maxSkipLevels, + state.numDocs, + out, + null); + skipInterval = state.skipInterval; + maxSkipLevels = state.maxSkipLevels; + posWriter = new FormatPostingsPositionsWriter(state, this); } + void start(IndexOutput termsOut) throws IOException { + this.termsOut = termsOut; + PostingsCodec.writeHeader(termsOut, CODEC, VERSION_CURRENT); + termsOut.writeInt(skipInterval); // write skipInterval + termsOut.writeInt(maxSkipLevels); // write maxSkipLevels + posWriter.start(termsOut); + } + + void startTerm() { + freqStart = out.getFilePointer(); + if (!omitTF) + posWriter.startTerm(); + skipListWriter.resetSkip(); + } + + // nocommit -- should we NOT reuse across fields? would + // be cleaner + + // Currently, this instance is re-used across fields, so + // our parent calls setField whenever the field changes void setField(FieldInfo fieldInfo) { this.fieldInfo = fieldInfo; omitTF = fieldInfo.omitTf; @@ -65,11 +97,15 @@ int lastDocID; int df; + int count; + /** Adds a new doc in this term. If this returns null * then we just skip consuming positions/payloads. 
*/ - FormatPostingsPositionsConsumer addDoc(int docID, int termDocFreq) throws IOException { + PositionsConsumer addDoc(int docID, int termDocFreq) throws IOException { final int delta = docID - lastDocID; + if (PostingsCodec.DEBUG) + System.out.println(" dw.addDoc [" + desc + "] count=" + (count++) + " docID=" + docID + " lastDocID=" + lastDocID + " delta=" + delta + " omitTF=" + omitTF + " freq=" + termDocFreq + " freqPointer=" + out.getFilePointer()); if (docID < 0 || (df > 0 && delta <= 0)) throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )"); @@ -78,8 +114,12 @@ // TODO: abstraction violation skipListWriter.setSkipData(lastDocID, storePayloads, posWriter.lastPayloadLength); skipListWriter.bufferSkip(df); + if (PostingsCodec.DEBUG) + System.out.println(" bufferSkip lastDocID=" + lastDocID + " df=" + df + " freqFP=" + out.getFilePointer() + " proxFP=" + skipListWriter.proxOutput.getFilePointer()); } + // nocommit -- move this assert up above; every consumer + // shouldn't have to check for this bug: assert docID < totalNumDocs: "docID=" + docID + " totalNumDocs=" + totalNumDocs; lastDocID = docID; @@ -92,36 +132,56 @@ out.writeVInt(termDocFreq); } - return posWriter; + // nocommit + if (PostingsCodec.DEBUG) + ((FormatPostingsPositionsWriter) posWriter).desc = desc + ":" + docID; + + if (omitTF) + return null; + else + return posWriter; } - private final TermInfo termInfo = new TermInfo(); // minimize consing - final UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result(); - /** Called when we are done adding docs to this term */ - void finish() throws IOException { - long skipPointer = skipListWriter.writeSkip(out); + void finishTerm(int docCount, boolean isIndexTerm) throws IOException { - // TODO: this is abstraction violation -- we should not - // peek up into parents terms encoding format - termInfo.set(df, parent.freqStart, parent.proxStart, (int) (skipPointer - parent.freqStart)); + // nocommit -- wasteful we are counting this in two places? 
+ assert docCount == df; + if (PostingsCodec.DEBUG) + System.out.println("dw.finishTerm termsOut pointer=" + termsOut.getFilePointer() + " freqStart=" + freqStart + " df=" + df); - // TODO: we could do this incrementally - UnicodeUtil.UTF16toUTF8(parent.currentTerm, parent.currentTermStart, utf8); + if (isIndexTerm) + // Write absolute at seek points + termsOut.writeVLong(freqStart); + else + // Write delta between seek points + termsOut.writeVLong(freqStart - lastFreqStart); - if (df > 0) { - parent.termsOut.add(fieldInfo.number, - utf8.result, - utf8.length, - termInfo); + lastFreqStart = freqStart; + + if (df >= skipInterval) { + if (PostingsCodec.DEBUG) + System.out.println(" writeSkip @ freqFP=" + out.getFilePointer() + " freqStartFP=" + freqStart); + termsOut.writeVLong(skipListWriter.writeSkip(out)-freqStart); } + if (!omitTF) + posWriter.finishTerm(isIndexTerm); + lastDocID = 0; df = 0; + + // nocommit + count = 0; } void close() throws IOException { - out.close(); - posWriter.close(); + if (PostingsCodec.DEBUG) + System.out.println("docs writer close pointer=" + out.getFilePointer()); + try { + out.close(); + } finally { + posWriter.close(); + } } } Index: src/java/org/apache/lucene/index/IndexFileNames.java =================================================================== --- src/java/org/apache/lucene/index/IndexFileNames.java (revision 747337) +++ src/java/org/apache/lucene/index/IndexFileNames.java (working copy) @@ -109,6 +109,10 @@ GEN_EXTENSION, NORMS_EXTENSION, COMPOUND_FILE_STORE_EXTENSION, + // nocommit -- need cleaner way! + "doc", + "pyl", + "skp" }; /** File extensions that are added to a compound file @@ -154,6 +158,12 @@ TERMS_INDEX_EXTENSION, TERMS_EXTENSION }; + + static final String COMPOUND_EXTENSIONS_NOT_CODEC[] = new String[] { + FIELD_INFOS_EXTENSION, + FIELDS_INDEX_EXTENSION, + FIELDS_EXTENSION, + }; /** File extensions for term vector support */ static final String VECTOR_EXTENSIONS[] = new String[] { Index: src/java/org/apache/lucene/index/FieldsEnum.java =================================================================== --- src/java/org/apache/lucene/index/FieldsEnum.java (revision 0) +++ src/java/org/apache/lucene/index/FieldsEnum.java (revision 0) @@ -0,0 +1,50 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.util.AttributeSource; + +/** Enumerates indexed fields. + * + * NOTE: this API is experimental and will likely change */ + +public abstract class FieldsEnum extends AttributeSource { + + /** Seeks to the specified field. Returns true if the field + * exists. */ + public abstract boolean seek(String field) throws IOException; + + // nocommit -- why not return String/null? 
+ /** Increments the enumeration to the next field. True if one exists.*/ + public abstract boolean next() throws IOException; + + /** Returns the current Field.*/ + public abstract String field(); + + /** Get TermsEnum for the current field. You should not + * call {@link #next()} or {@link #seek()} until you're + * done using this TermsEnum. */ + public abstract TermsEnum terms() throws IOException; + + // nocommit -- maybe no close method? + /** Closes the enumeration to further activity, freeing resources. */ + //public abstract void close() throws IOException; +} + Property changes on: src/java/org/apache/lucene/index/FieldsEnum.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/index/SegmentTermPositions.java =================================================================== --- src/java/org/apache/lucene/index/SegmentTermPositions.java (revision 747337) +++ src/java/org/apache/lucene/index/SegmentTermPositions.java (working copy) @@ -18,12 +18,14 @@ */ import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.BitVector; import java.io.IOException; final class SegmentTermPositions extends SegmentTermDocs implements TermPositions { private IndexInput proxStream; + private IndexInput proxStreamOrig; private int proxCount; private int position; @@ -38,9 +40,9 @@ private long lazySkipPointer = -1; private int lazySkipProxCount = 0; - SegmentTermPositions(SegmentReader p) { - super(p); - this.proxStream = null; // the proxStream will be cloned lazily when nextPosition() is called for the first time + SegmentTermPositions(IndexInput freqStream, IndexInput proxStream, BitVector deletedDocs, TermInfosReader tis, FieldInfos fieldInfos) { + super(freqStream, deletedDocs, tis, fieldInfos); + this.proxStreamOrig = proxStream; // the proxStream will be cloned lazily when nextPosition() is called for the first time } final void seek(TermInfo ti, Term term) throws IOException { @@ -146,7 +148,7 @@ private void lazySkip() throws IOException { if (proxStream == null) { // clone lazily - proxStream = (IndexInput)parent.proxStream.clone(); + proxStream = (IndexInput)proxStreamOrig.clone(); } // we might have to skip the current payload Index: src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java (revision 747337) +++ src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java (working copy) @@ -1,73 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; - -import org.apache.lucene.store.Directory; - -final class FormatPostingsFieldsWriter extends FormatPostingsFieldsConsumer { - - final Directory dir; - final String segment; - final TermInfosWriter termsOut; - final FieldInfos fieldInfos; - final FormatPostingsTermsWriter termsWriter; - final DefaultSkipListWriter skipListWriter; - final int totalNumDocs; - - public FormatPostingsFieldsWriter(SegmentWriteState state, FieldInfos fieldInfos) throws IOException { - super(); - - dir = state.directory; - segment = state.segmentName; - totalNumDocs = state.numDocs; - this.fieldInfos = fieldInfos; - termsOut = new TermInfosWriter(dir, - segment, - fieldInfos, - state.termIndexInterval); - - // TODO: this is a nasty abstraction violation (that we - // peek down to find freqOut/proxOut) -- we need a - // better abstraction here whereby these child consumers - // can provide skip data or not - skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval, - termsOut.maxSkipLevels, - totalNumDocs, - null, - null); - - state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_EXTENSION)); - state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION)); - - termsWriter = new FormatPostingsTermsWriter(state, this); - } - - /** Add a new field */ - FormatPostingsTermsConsumer addField(FieldInfo field) { - termsWriter.setField(field); - return termsWriter; - } - - /** Called when we are done adding everything. */ - void finish() throws IOException { - termsOut.close(); - termsWriter.close(); - } -} Index: src/java/org/apache/lucene/index/FormatPostingsTermsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsTermsConsumer.java (revision 747337) +++ src/java/org/apache/lucene/index/FormatPostingsTermsConsumer.java (working copy) @@ -1,46 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; - -import org.apache.lucene.util.ArrayUtil; - -/** - * NOTE: this API is experimental and will likely change - */ - -abstract class FormatPostingsTermsConsumer { - - /** Adds a new term in this field; term ends with U+FFFF - * char */ - abstract FormatPostingsDocsConsumer addTerm(char[] text, int start) throws IOException; - - char[] termBuffer; - FormatPostingsDocsConsumer addTerm(String text) throws IOException { - final int len = text.length(); - if (termBuffer == null || termBuffer.length < 1+len) - termBuffer = new char[ArrayUtil.getNextSize(1+len)]; - text.getChars(0, len, termBuffer, 0); - termBuffer[len] = 0xffff; - return addTerm(termBuffer, 0); - } - - /** Called when we are done adding terms to this field */ - abstract void finish() throws IOException; -} Index: src/java/org/apache/lucene/index/TermsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/TermsConsumer.java (revision 0) +++ src/java/org/apache/lucene/index/TermsConsumer.java (revision 0) @@ -0,0 +1,37 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +/** + * NOTE: this API is experimental and will likely change + */ + +abstract class TermsConsumer { + + /** Starts a new term in this field; term ends with U+FFFF + * char */ + abstract DocsConsumer startTerm(char[] text, int start) throws IOException; + + /** Finishes the current term */ + abstract void finishTerm(char[] text, int start, int numDocs) throws IOException; + + /** Called when we are done adding terms to this field */ + abstract void finish() throws IOException; +} Property changes on: src/java/org/apache/lucene/index/TermsConsumer.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/index/FormatPostingsPositionsReader.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsPositionsReader.java (revision 0) +++ src/java/org/apache/lucene/index/FormatPostingsPositionsReader.java (revision 0) @@ -0,0 +1,234 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Collection; + +import org.apache.lucene.util.BitVector; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.Directory; + +// nocommit -- base class should not be named terms dict: +// this class interacts w/ a docsreader +class FormatPostingsPositionsReader extends FormatPostingsTermsDictPositionsReader { + + final IndexInput proxIn; + IndexInput termsIn; + + FormatPostingsPositionsReader(Directory dir, SegmentInfo segmentInfo, int readBufferSize) throws IOException { + assert segmentInfo.getHasProx(); + proxIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, IndexFileNames.PROX_EXTENSION), readBufferSize); + } + + void start(IndexInput termsIn) throws IOException { + this.termsIn = termsIn; + + PostingsCodec.checkHeader(termsIn, FormatPostingsPositionsWriter.CODEC, FormatPostingsPositionsWriter.VERSION_START); + } + + static void files(SegmentInfo segmentInfo, Collection files) { + if (segmentInfo.getHasProx()) + files.add(IndexFileNames.segmentFileName(segmentInfo.name, IndexFileNames.PROX_EXTENSION)); + } + + Reader reader(FieldInfo fieldInfo, IndexInput termsIn) { + return new TermsDictReader(termsIn, fieldInfo); + } + + void close() throws IOException { + if (proxIn != null) + proxIn.close(); + } + + class TermsDictReader extends Reader { + + final IndexInput termsIn; + final FieldInfo fieldInfo; + long proxOffset; + + TermsDictReader(IndexInput termsIn, FieldInfo fieldInfo) { + this.termsIn = termsIn; + this.fieldInfo = fieldInfo; + } + + void readTerm(int docFreq, boolean isIndexTerm) throws IOException { + if (PostingsCodec.DEBUG) + System.out.println(" pr.readterm termsInPointer=" + termsIn.getFilePointer() + " isIndex=" + isIndexTerm); + if (isIndexTerm) + proxOffset = termsIn.readVLong(); + else + proxOffset += termsIn.readVLong(); + if (PostingsCodec.DEBUG) + System.out.println(" proxOffset=" + proxOffset); + if (positions != null) { + positions.seekPending = true; + positions.skipOffset = proxOffset; + positions.skipPosCount = 0; + } + } + + void close() throws IOException { + } + + SegmentPositionsEnum positions; + + PositionsEnum positions() throws IOException { + + if (positions == null) + // Lazy init + positions = new SegmentPositionsEnum(); + + return positions; + } + + // nocommit -- should we have different reader for + // payload vs no payload? 
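
A small self-contained sketch of the decode rule readTerm above implements: terms-dict file pointers are written as absolute vLongs at index (seek) points and as deltas from the previous term everywhere else. The array stands in for vLongs read sequentially from the terms dict, and treating every indexInterval'th term as an index term is an assumption for illustration.

// Hedged sketch of the absolute-at-index-term / delta-otherwise decode.
class PointerDecodeSketch {
  static long[] decode(long[] encoded, int indexInterval) {
    final long[] pointers = new long[encoded.length];
    long last = 0;
    for (int i = 0; i < encoded.length; i++) {
      final boolean isIndexTerm = (i % indexInterval) == 0;
      // index terms carry the absolute pointer; others carry a delta
      last = isIndexTerm ? encoded[i] : last + encoded[i];
      pointers[i] = last;
    }
    return pointers;
  }
}
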
+  class SegmentPositionsEnum extends PositionsEnum {
+
+    // nocommit
+    String desc;
+
+    final IndexInput proxIn;
+
+    final boolean storePayloads;
+
+    boolean seekPending;        // True if we must seek before reading next position
+    boolean payloadPending;     // True if we must skip payload before reading next position
+
+    long skipOffset;
+    int skipPosCount;
+
+    int position;
+    int payloadLength;
+
+    SegmentPositionsEnum() {
+      if (PostingsCodec.DEBUG)
+        System.out.println("new pos enum");
+      proxIn = (IndexInput) FormatPostingsPositionsReader.this.proxIn.clone();
+      storePayloads = fieldInfo.storePayloads;
+    }
+
+    void skip(long proxOffset, int lastPayloadLength, int numPositions) {
+      skipOffset = proxOffset;
+      payloadLength = lastPayloadLength;
+      assert payloadLength >= 0 || payloadLength == -1;
+      skipPosCount = numPositions;
+      seekPending = true;
+      payloadPending = false;
+      if (PostingsCodec.DEBUG)
+        System.out.println("pr [" + desc + "] skip fp= " + proxOffset + " numPositions=" + numPositions);
+    }
+
+    void skip(int numPositions) {
+      skipPosCount += numPositions;
+      if (PostingsCodec.DEBUG)
+        System.out.println("pr [" + desc + "] skip " + numPositions + " positions; now " + skipPosCount);
+    }
+
+    void catchUp(int currentCount) throws IOException {
+      if (PostingsCodec.DEBUG)
+        System.out.println("  pos catchup: seekPending=" + seekPending + " skipOffset=" + skipOffset + " skipPosCount " + skipPosCount + " vs currentCount " + currentCount + " payloadLen=" + payloadLength);
+      if (seekPending) {
+        proxIn.seek(skipOffset);
+        seekPending = false;
+      }
+
+      while(skipPosCount > currentCount)
+        next();
+      if (PostingsCodec.DEBUG)
+        System.out.println("  pos catchup done");
+      positions.init();
+    }
+
+    void init() {
+      if (PostingsCodec.DEBUG)
+        System.out.println("  pos init");
+      position = 0;
+    }
+
+    int next() throws IOException {
+
+      if (PostingsCodec.DEBUG)
+        System.out.println("    pr.next [" + desc + "]: fp=" + proxIn.getFilePointer() + " return pos=" + position);
+
+      if (storePayloads) {
+
+        if (payloadPending && payloadLength > 0) {
+          if (PostingsCodec.DEBUG)
+            System.out.println("      payload pending: skip " + payloadLength + " bytes");
+          proxIn.seek(proxIn.getFilePointer()+payloadLength);
+        }
+
+        final int code = proxIn.readVInt();
+        if ((code & 1) != 0) {
+          // Payload length has changed
+          payloadLength = proxIn.readVInt();
+          assert payloadLength >= 0;
+          if (PostingsCodec.DEBUG)
+            System.out.println("      new payloadLen=" + payloadLength);
+        }
+        assert payloadLength != -1;
+
+        payloadPending = true;
+        position += code >>> 1;
+      } else
+        position += proxIn.readVInt();
+
+      skipPosCount--;
+
+      // NOTE: the old API actually allowed this...
+ assert skipPosCount >= 0: "next() was called too many times (more than FormatPostingsDocsEnum.freq() times)"; + + if (PostingsCodec.DEBUG) + System.out.println(" proxFP=" + proxIn.getFilePointer() + " return pos=" + position); + return position; + } + + int getPayloadLength() { + return payloadLength; + } + + byte[] getPayload(byte[] data, int offset) throws IOException { + + if (!payloadPending) + throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once."); + + final byte[] retArray; + final int retOffset; + if (data == null || data.length-offset < payloadLength) { + // the array is too small to store the payload data, + // so we allocate a new one + retArray = new byte[payloadLength]; + retOffset = 0; + } else { + retArray = data; + retOffset = offset; + } + + proxIn.readBytes(retArray, retOffset, payloadLength); + payloadPending = false; + return retArray; + } + + public boolean hasPayload() { + return payloadPending && payloadLength > 0; + } + } + } +} Property changes on: src/java/org/apache/lucene/index/FormatPostingsPositionsReader.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/index/SegmentInfo.java =================================================================== --- src/java/org/apache/lucene/index/SegmentInfo.java (revision 747337) +++ src/java/org/apache/lucene/index/SegmentInfo.java (working copy) @@ -78,8 +78,10 @@ // (if it's an older index) private boolean hasProx; // True if this segment has any fields with omitTf==false + private boolean flexPostings; // True if postings were written with new flex format + private PostingsCodec codec; - public SegmentInfo(String name, int docCount, Directory dir) { + public SegmentInfo(String name, int docCount, Directory dir, PostingsCodec codec) { this.name = name; this.docCount = docCount; this.dir = dir; @@ -92,15 +94,20 @@ docStoreIsCompoundFile = false; delCount = 0; hasProx = true; + flexPostings = true; + this.codec = codec; } + /* public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile) { this(name, docCount, dir, isCompoundFile, hasSingleNormFile, -1, null, false, true); } + */ public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile, - int docStoreOffset, String docStoreSegment, boolean docStoreIsCompoundFile, boolean hasProx) { - this(name, docCount, dir); + int docStoreOffset, String docStoreSegment, boolean docStoreIsCompoundFile, boolean hasProx, + PostingsCodec codec) { + this(name, docCount, dir, codec); this.isCompoundFile = (byte) (isCompoundFile ? 
YES : NO); this.hasSingleNormFile = hasSingleNormFile; preLockless = false; @@ -108,6 +115,7 @@ this.docStoreSegment = docStoreSegment; this.docStoreIsCompoundFile = docStoreIsCompoundFile; this.hasProx = hasProx; + this.codec = codec; delCount = 0; assert docStoreOffset == -1 || docStoreSegment != null: "dso=" + docStoreOffset + " dss=" + docStoreSegment + " docCount=" + docCount; } @@ -133,6 +141,7 @@ isCompoundFile = src.isCompoundFile; hasSingleNormFile = src.hasSingleNormFile; delCount = src.delCount; + codec = src.codec; } /** @@ -143,10 +152,11 @@ * @param format format of the segments info file * @param input input handle to read segment info from */ - SegmentInfo(Directory dir, int format, IndexInput input) throws IOException { + SegmentInfo(Directory dir, int format, IndexInput input, PostingsCodecs codecs) throws IOException { this.dir = dir; name = input.readString(); docCount = input.readInt(); + final String codecName; if (format <= SegmentInfos.FORMAT_LOCKLESS) { delGen = input.readLong(); if (format <= SegmentInfos.FORMAT_SHARED_DOC_STORE) { @@ -188,6 +198,11 @@ hasProx = input.readByte() == 1; else hasProx = true; + + if (format <= SegmentInfos.FORMAT_FLEX_POSTINGS) + codecName = input.readString(); + else + codecName = "PreFlex"; } else { delGen = CHECK_DIR; normGen = null; @@ -199,7 +214,9 @@ docStoreSegment = null; delCount = -1; hasProx = true; + codecName = "PreFlex"; } + codec = codecs.lookup(codecName); } void setNumFields(int numFields) { @@ -282,7 +299,7 @@ } public Object clone () { - SegmentInfo si = new SegmentInfo(name, docCount, dir); + SegmentInfo si = new SegmentInfo(name, docCount, dir, codec); si.isCompoundFile = isCompoundFile; si.delGen = delGen; si.delCount = delCount; @@ -294,6 +311,8 @@ si.docStoreOffset = docStoreOffset; si.docStoreSegment = docStoreSegment; si.docStoreIsCompoundFile = docStoreIsCompoundFile; + si.hasProx = hasProx; + si.codec = codec; return si; } @@ -517,6 +536,7 @@ output.writeByte(isCompoundFile); output.writeInt(delCount); output.writeByte((byte) (hasProx ? 1:0)); + output.writeString(codec.name); } void setHasProx(boolean hasProx) { @@ -528,6 +548,15 @@ return hasProx; } + public void setCodec(PostingsCodec codec) { + assert this.codec == null; + this.codec = codec; + } + + PostingsCodec getCodec() { + return codec; + } + private void addIfExists(List files, String fileName) throws IOException { if (dir.fileExists(fileName)) files.add(fileName); @@ -538,7 +567,6 @@ * returns List is a locally cached List so you should not * modify it. */ - public List files() throws IOException { if (files != null) { @@ -555,7 +583,10 @@ } else { final String[] exts = IndexFileNames.NON_STORE_INDEX_EXTENSIONS; for(int i=0;i FieldInfo */ + + PreFlexTermInfosReader(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize) + throws IOException { + tis = new TermInfosReader(dir, info.name, fieldInfos, readBufferSize); + this.fieldInfos = fieldInfos; + + // make sure that all index files have been read or are kept open + // so that if an index update removes them we'll still have them + freqStream = dir.openInput(info.name + ".frq", readBufferSize); + boolean anyProx = false; + final int numFields = fieldInfos.size(); + for(int i=0;i 0: got " + indexDivisor); + + if (anyIndexRead) + throw new IllegalStateException("index terms are already loaded"); + + this.indexDivisor = indexDivisor; + totalIndexInterval = indexInterval * indexDivisor; + } + + /** Returns the indexDivisor. 
+ * @see #setIndexDivisor + */ + public int getIndexDivisor() { + return indexDivisor; + } + + public FieldsEnum fields(BitVector deletedDocs) { + if (PostingsCodec.DEBUG) + System.out.println("tdr.fields(): field count=" + fields.size()); + return new Fields(deletedDocs); + } + + private class Fields extends FieldsEnum { + Iterator it; + FieldReader current; + final BitVector deletedDocs; + + Fields(BitVector deletedDocs) { + this.deletedDocs = deletedDocs; + } + + public boolean next() { + if (PostingsCodec.DEBUG) { + System.out.println("tdrf.next seg=" + segment); + new Throwable().printStackTrace(System.out); + } + if (it == null) { + if (PostingsCodec.DEBUG) + System.out.println(" init it"); + it = fields.values().iterator(); + } + if (it.hasNext()) { + current = (FieldReader) it.next(); + if (PostingsCodec.DEBUG) + System.out.println(" hasNext set current field=" + current.fieldInfo.name); + return true; + } else + return false; + } + + public String field() { + return current.fieldInfo.name; + } + + public boolean seek(String field) { + if (PostingsCodec.DEBUG) + System.out.println("tdrf.seek field=" + field + " seg=" + segment); + + it = fields.tailMap(field).values().iterator(); + if (next()) { + if (PostingsCodec.DEBUG) + System.out.println(" return " + field.equals(current.fieldInfo.name)); + return field.equals(current.fieldInfo.name); + } else + return false; + } + + public TermsEnum terms() throws IOException { + return current.terms(deletedDocs); + } + + public void close() {} + } + + private class FieldReader { + + final long numTerms; + final FieldInfo fieldInfo; + final long indexStartPointer; + final long termsStartPointer; + + // TODO: genericize "skipper" API so that we could swap + // in a multi-level skipper, here, instead of flat one: + // TODO: we could save mem here by packing our own shared char[]'s + String[] indexTerms; + long[] indexOffsets; + + FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long indexStartPointer) { + this.fieldInfo = fieldInfo; + this.numTerms = numTerms; + assert numTerms > 0; + this.indexStartPointer = indexStartPointer; + this.termsStartPointer = termsStartPointer; + } + + synchronized final void readIndex() throws IOException { + if (indexTerms != null) + return; + + final int indexSize = (int) (1+(numTerms-1)/totalIndexInterval); + + if (PostingsCodec.DEBUG) + System.out.println(" tdr.readIndex field=" + fieldInfo.name + " numTerms=" + numTerms + " indexSize=" + indexSize + " indexSeek=" + indexStartPointer + " segment=" + segment + " indexDivisor=" + indexDivisor); + + IndexInput in = (IndexInput) indexIn.clone(); + in.seek(indexStartPointer); + + indexTerms = new String[indexSize]; + indexOffsets = new long[indexSize]; + + if (PostingsCodec.DEBUG) + System.out.println("read index for field=" + fieldInfo.name); + + long pointer = termsStartPointer; + final DeltaBytesReader bytesReader = new DeltaBytesReader(in); + final int numIndexTerms = (int) (1+(numTerms-1)/indexInterval); + int upto = 0; + for(int i=0;i= numTerms) { + termUpto++; + return false; + } + if (PostingsCodec.DEBUG) { + System.out.println("tdr.next: field=" + fieldInfo.name + " termsInPointer=" + in.getFilePointer() + " vs len=" + in.length() + " isIndex=" + (termUpto%indexInterval==0) + " seg=" + segment); + //new Throwable().printStackTrace(System.out); + } + bytesReader.read(); + docFreq = in.readVInt(); + if (PostingsCodec.DEBUG) + System.out.println(" text=" + bytesReader.text() + " freq=" + docFreq); + docs.readTerm(docFreq, termUpto % 
indexInterval == 0);
+        termUpto++;
+        if (PostingsCodec.DEBUG)
+          System.out.println("  termUpto=" + termUpto + " vs numTerms=" + numTerms + " fp=" + in.getFilePointer());
+        return true;
+      }
+
+      public int docFreq() {
+        if (termUpto >= 1+numTerms)
+          return 0;
+        else
+          return docFreq;
+      }
+
+      public String text() {
+        // nocommit -- really necessary?
+        if (termUpto >= 1+numTerms)
+          return null;
+        else
+          return bytesReader.text();
+      }
+
+      public long ord() {
+        return termUpto-1;
+      }
+
+      public DocsEnum docs() throws IOException {
+        doSkip = false;
+        // nocommit
+        DocsEnum docsEnum = docs.docs(deletedDocs);
+        docsEnum.desc = fieldInfo.name + ":" + bytesReader.text();
+        return docsEnum;
+      }
+
+      public void close() throws IOException {
+        in.close();
+        docs.close();
+      }
+    }
+  }
+
+  private static class DeltaBytesReader {
+    private byte[] bytes;
+    final UnicodeUtil.UTF16Result chars = new UnicodeUtil.UTF16Result();
+    final UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
+    private int length;
+    final IndexInput in;
+    boolean started;
+
+    DeltaBytesReader(IndexInput in) {
+      this.in = in;
+      bytes = new byte[10];
+    }
+
+    void reset(String text) {
+      UnicodeUtil.UTF16toUTF8(text, 0, text.length(), utf8);
+      if (utf8.length > bytes.length)
+        bytes = ArrayUtil.grow(bytes, utf8.length);
+      System.arraycopy(utf8.result, 0,
+                       this.bytes, 0, utf8.length);
+      this.length = utf8.length;
+      chars.copyText(text);
+    }
+
+    String text() {
+      // nocommit -- cache this?
+      return new String(chars.result, 0, chars.length);
+    }
+
+    int compareTo(String other) {
+
+      final int otherLength = other.length();
+      final int minLength;
+      if (otherLength < chars.length)
+        minLength = otherLength;
+      else
+        minLength = chars.length;
+
+      for(int i=0;i<minLength;i++) {
+        final char c = chars.result[i];
+        final char otherC = other.charAt(i);
+        if (c < otherC)
+          return -1;
+        else if (c > otherC)
+          return 1;
+      }
+
+      if (chars.length < otherLength)
+        return -1;
+      else if (chars.length > otherLength)
+        return 1;
+      else
+        return 0;
+    }
+
+    void read() throws IOException {
+      //System.out.println("terms reader fp=" + in.getFilePointer() + " this=" + this);
+      final int start = in.readVInt();
+      final int suffix = in.readVInt();
+      //System.out.println("  start=" + start + " suffix=" + suffix);
+      assert start <= length: "start=" + start + " length=" + length;
+
+      if (start + suffix > bytes.length)
+        bytes = ArrayUtil.grow(bytes, start+suffix);
+      in.readBytes(bytes, start, suffix);
+      length = start + suffix;
+
+      // TODO: conversion could be incremental
+      UnicodeUtil.UTF8toUTF16(bytes, 0, length, chars);
+      started = true;
+    }
+  }
+
+}
\ No newline at end of file

Property changes on: src/java/org/apache/lucene/index/FormatPostingsTermsDictReader.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: src/java/org/apache/lucene/index/FormatPostingsTermsDictDocsReader.java
===================================================================
--- src/java/org/apache/lucene/index/FormatPostingsTermsDictDocsReader.java	(revision 0)
+++ src/java/org/apache/lucene/index/FormatPostingsTermsDictDocsReader.java	(revision 0)
@@ -0,0 +1,46 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.BitVector; + +/** TermsDictReader interacts with a single instance of this + * to manage creation of multiple docs enum + * instances. */ +abstract class FormatPostingsTermsDictDocsReader { + + abstract class Reader { + abstract void readTerm(int docFreq, boolean isIndexTerm) throws IOException; + + /** Returns a docs enum for the last term read */ + abstract DocsEnum docs(BitVector deletedDocs) throws IOException; + + abstract void close() throws IOException; + } + + abstract void start(IndexInput termsIn) throws IOException; + + /** Returns a new private reader for stepping through + * terms, getting DocsEnum. */ + abstract Reader reader(FieldInfo fieldInfo, IndexInput termsIn) throws IOException; + + abstract void close() throws IOException; +} Property changes on: src/java/org/apache/lucene/index/FormatPostingsTermsDictDocsReader.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/index/FormatSepPositionsReader.java =================================================================== --- src/java/org/apache/lucene/index/FormatSepPositionsReader.java (revision 0) +++ src/java/org/apache/lucene/index/FormatSepPositionsReader.java (revision 0) @@ -0,0 +1,270 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+import java.io.IOException;
+import java.util.Collection;
+
+import org.apache.lucene.util.BitVector;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.Directory;
+
+// nocommit -- base class should not be named terms dict:
+// this class interacts w/ a docsreader
+class FormatSepPositionsReader extends FormatPostingsTermsDictPositionsReader {
+
+  static boolean DEBUG = false;
+
+  final IndexInput posIn;
+  final IndexInput payloadIn;
+  IndexInput termsIn;
+
+  FormatSepPositionsReader(Directory dir, SegmentInfo segmentInfo, int readBufferSize) throws IOException {
+    assert segmentInfo.getHasProx();
+    boolean success = false;
+    try {
+      posIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, IndexFileNames.PROX_EXTENSION), readBufferSize);
+      payloadIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, "pyl"), readBufferSize);
+      success = true;
+    } finally {
+      if (!success)
+        close();
+    }
+  }
+
+  void start(IndexInput termsIn) throws IOException {
+    this.termsIn = termsIn;
+
+    // nocommit -- move these 2 constants into XXXCodec?
+    PostingsCodec.checkHeader(termsIn, FormatSepPositionsWriter.CODEC, FormatSepPositionsWriter.VERSION_START);
+  }
+
+  static void files(SegmentInfo segmentInfo, Collection files) {
+    if (segmentInfo.getHasProx()) {
+      files.add(IndexFileNames.segmentFileName(segmentInfo.name, IndexFileNames.PROX_EXTENSION));
+      files.add(IndexFileNames.segmentFileName(segmentInfo.name, "pyl"));
+    }
+  }
+
+  Reader reader(FieldInfo fieldInfo, IndexInput termsIn) {
+    return new TermsDictReader(termsIn, fieldInfo);
+  }
+
+  void close() throws IOException {
+    try {
+      if (posIn != null)
+        posIn.close();
+    } finally {
+      if (payloadIn != null)
+        payloadIn.close();
+    }
+  }
+
+  class TermsDictReader extends Reader {
+
+    final IndexInput termsIn;
+    final FieldInfo fieldInfo;
+    long posOffset;
+    long payloadOffset;
+
+    TermsDictReader(IndexInput termsIn, FieldInfo fieldInfo) {
+      this.termsIn = termsIn;
+      this.fieldInfo = fieldInfo;
+    }
+
+    void readTerm(int docFreq, boolean isIndexTerm) throws IOException {
+      if (DEBUG)
+        System.out.println("  pr.readterm termsInPointer=" + termsIn.getFilePointer() + " isIndex=" + isIndexTerm);
+      if (isIndexTerm) {
+        posOffset = termsIn.readVLong();
+        payloadOffset = termsIn.readVLong();
+      } else {
+        posOffset += termsIn.readVLong();
+        payloadOffset += termsIn.readVLong();
+      }
+      if (DEBUG)
+        System.out.println("    posOffset=" + posOffset + " payloadOffset=" + payloadOffset);
+      if (positions != null) {
+        positions.seekPending = true;
+        positions.skipPosOffset = posOffset;
+        positions.skipPayloadOffset = payloadOffset;
+        positions.skipPosCount = 0;
+      }
+    }
+
+    void close() throws IOException {}
+
+    SegmentPositionsEnum positions;
+
+    PositionsEnum positions() throws IOException {
+
+      if (positions == null)
+        // Lazy init
+        positions = new SegmentPositionsEnum();
+
+      return positions;
+    }
+
+    // nocommit -- should we have different reader for
+    // payload vs no payload?
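
Before the enum below, it may help to see the encoding it decodes, sketched with plain ints standing in for vInts: with payloads stored, each position delta is shifted left one bit and the low bit flags "payload length changed", in which case the new length follows as its own vInt. The encoder shape here is an assumption inferred from the reader's code-&-1 logic, not the writer's actual source.

// Hedged sketch of the per-position code the positions enum decodes.
class PositionCodeSketch {
  // returns the vInt values an encoder would emit for one position
  static int[] encode(int prevPos, int pos, int prevPayloadLen, int payloadLen) {
    final int delta = pos - prevPos;
    if (payloadLen != prevPayloadLen)
      return new int[] { (delta << 1) | 1, payloadLen }; // new length follows
    else
      return new int[] { delta << 1 };                   // reuse last length
  }
}
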
+    class SegmentPositionsEnum extends PositionsEnum {
+
+      // nocommit
+      String desc;
+
+      final IndexInput posIn;
+      final IndexInput payloadIn;
+
+      final boolean storePayloads;
+
+      boolean seekPending;        // True if we must seek before reading next position
+      boolean payloadPending;     // True if we must skip payload before reading next position
+
+      long skipPosOffset;
+      long skipPayloadOffset;
+      int skipPosCount;
+
+      int position;
+      int payloadLength;
+
+      SegmentPositionsEnum() {
+        if (DEBUG)
+          System.out.println("new pos enum");
+        posIn = (IndexInput) FormatSepPositionsReader.this.posIn.clone();
+        storePayloads = fieldInfo.storePayloads;
+        if (storePayloads)
+          payloadIn = (IndexInput) FormatSepPositionsReader.this.payloadIn.clone();
+        else
+          payloadIn = null;
+      }
+
+      void skip(long posOffset, long payloadOffset, int lastPayloadLength, int numPositions) {
+        skipPosOffset = posOffset;
+        skipPayloadOffset = payloadOffset;
+        payloadLength = lastPayloadLength;
+        assert payloadLength >= 0 || payloadLength == -1;
+        skipPosCount = numPositions;
+        seekPending = true;
+        payloadPending = false;
+        if (DEBUG)
+          System.out.println("pr [" + desc + "] skip posFP= " + posOffset + " payloadFP=" + payloadOffset + " numPositions=" + numPositions);
+      }
+
+      void skip(int numPositions) {
+        skipPosCount += numPositions;
+        if (DEBUG)
+          System.out.println("pr [" + desc + "] skip " + numPositions + " positions; now " + skipPosCount);
+      }
+
+      void catchUp(int currentCount) throws IOException {
+        if (DEBUG)
+          System.out.println("  pos catchup: seekPending=" + seekPending + " skipPosFP=" + skipPosOffset + " skipPayloadFP=" + skipPayloadOffset + " skipPosCount " + skipPosCount + " vs currentCount " + currentCount);
+        if (seekPending) {
+          posIn.seek(skipPosOffset);
+          if (storePayloads)
+            payloadIn.seek(skipPayloadOffset);
+          seekPending = false;
+        }
+
+        while(skipPosCount > currentCount)
+          next();
+        if (DEBUG)
+          System.out.println("  pos catchup done");
+        positions.init();
+      }
+
+      void init() {
+        if (DEBUG)
+          System.out.println("  pos init");
+        position = 0;
+      }
+
+      int next() throws IOException {
+
+        if (DEBUG)
+          System.out.println("    pr.next [" + desc + "]: posFP=" + posIn.getFilePointer() + " return pos=" + position);
+
+        final int code = posIn.readVInt();
+
+        if (storePayloads) {
+
+          if (payloadPending && payloadLength > 0) {
+            if (DEBUG)
+              System.out.println("      payload pending: skip " + payloadLength + " bytes");
+            // TODO: we could do this lazily, when
+            // getPayload() is called
+            payloadIn.seek(payloadIn.getFilePointer()+payloadLength);
+          }
+
+          if ((code & 1) != 0) {
+            // Payload length has changed
+            payloadLength = posIn.readVInt();
+            assert payloadLength >= 0;
+            if (DEBUG)
+              System.out.println("      new payloadLen=" + payloadLength);
+          }
+          assert payloadLength != -1;
+
+          payloadPending = true;
+          position += code >>> 1;
+        } else
+          position += code;
+
+        skipPosCount--;
+
+        // NOTE: the old API actually allowed this...
+ assert skipPosCount >= 0: "next() was called too many times (more than FormatPostingsDocsEnum.freq() times)"; + + if (DEBUG) + System.out.println(" proxFP=" + posIn.getFilePointer() + " return pos=" + position); + + return position; + } + + int getPayloadLength() { + return payloadLength; + } + + byte[] getPayload(byte[] data, int offset) throws IOException { + + if (!payloadPending) + throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once."); + + final byte[] retArray; + final int retOffset; + if (data == null || data.length-offset < payloadLength) { + // the array is too small to store the payload data, + // so we allocate a new one + retArray = new byte[payloadLength]; + retOffset = 0; + } else { + retArray = data; + retOffset = offset; + } + + payloadIn.readBytes(retArray, retOffset, payloadLength); + payloadPending = false; + return retArray; + } + + public boolean hasPayload() { + return payloadPending && payloadLength > 0; + } + } + } +} \ No newline at end of file Property changes on: src/java/org/apache/lucene/index/FormatSepPositionsReader.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/index/FormatPostingsTermsDictWriter.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsTermsDictWriter.java (revision 0) +++ src/java/org/apache/lucene/index/FormatPostingsTermsDictWriter.java (revision 0) @@ -0,0 +1,250 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.List; +import java.util.ArrayList; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.ArrayUtil; + +/** + * Writes terms dict and interacts with docs/positions + * consumers to write the postings files. + * + * The [new] terms dict format is field-centric: each field + * has its own section in the file. Fields are written in + * UTF16 string comparison order. Within each field, each + * term's text is written in UTF16 string comparison order. 
+ */ + +class FormatPostingsTermsDictWriter extends FieldsConsumer { + + // Initial format + public static final int FORMAT = -1; + + public static final int FORMAT_CURRENT = FORMAT; + + private final int indexInterval; + private final DeltaBytesWriter termWriter; + private final DeltaBytesWriter termIndexWriter; + + final IndexOutput out; + final IndexOutput indexOut; + final DocsConsumer consumer; + final FieldInfos fieldInfos; + FieldInfo currentField; + + private List fields = new ArrayList(); + + // nocommit + private String segment; + + FormatPostingsTermsDictWriter(SegmentWriteState state, DocsConsumer consumer) throws IOException { + final String termsFileName = IndexFileNames.segmentFileName(state.segmentName, IndexFileNames.TERMS_EXTENSION); + out = state.directory.createOutput(termsFileName); + state.flushedFiles.add(termsFileName); + this.segment = state.segmentName; + + if (PostingsCodec.DEBUG) { + System.out.println("tdw: write to segment=" + state.segmentName); + new Throwable().printStackTrace(System.out); + } + + final String indexFileName = IndexFileNames.segmentFileName(state.segmentName, IndexFileNames.TERMS_INDEX_EXTENSION); + indexOut = state.directory.createOutput(indexFileName); + state.flushedFiles.add(indexFileName); + + fieldInfos = state.fieldInfos; + indexInterval = state.termIndexInterval; + + // Count indexed fields up front + final int numFields = fieldInfos.size(); + + out.writeInt(FORMAT_CURRENT); // write format + out.writeLong(0); // leave space for end index pointer + out.writeInt(indexInterval); // write indexInterval + + termWriter = new DeltaBytesWriter(out); + termIndexWriter = new DeltaBytesWriter(indexOut); + currentField = null; + this.consumer = consumer; + + consumer.start(out); // have consumer write its format/header + } + + TermsConsumer addField(FieldInfo field) { + if (PostingsCodec.DEBUG) + System.out.println("tdw.addField: field=" + field.name); + assert currentField == null || currentField.name.compareTo(field.name) < 0; + currentField = field; + TermsConsumer terms = new TermsWriter(field, consumer); + fields.add(terms); + return terms; + } + + void close() throws IOException { + if (PostingsCodec.DEBUG) + System.out.println("tdw.close seg=" + segment); + try { + final long indexPointer = out.getFilePointer(); + final int fieldCount = fields.size(); + + if (PostingsCodec.DEBUG) + System.out.println(" numFields=" + fieldCount); + + out.writeInt(fieldCount); + for(int i=0;i>>1; + if ((code & 1) != 0) + doc.numPositions = 1; + else + doc.numPositions = termsIn.readVInt(); + + if (doc.numPositions > doc.positions.length) + doc.reallocPositions(doc.numPositions); + + int position = 0; + int payloadLength = -1; + + for(int j=0;j>> 1; + if ((code2 & 1) != 0) + payloadLength = termsIn.readVInt(); + if (payloadLength > 0) { + if (pos.payload == null || payloadLength > pos.payload.length) + pos.payload = new byte[ArrayUtil.getNextSize(payloadLength)]; + termsIn.readBytes(pos.payload, 0, payloadLength); + } + } else + position += code2; + pos.pos = position; + pos.payloadLength = payloadLength; + } + } + doc.docID = docID; + } + + } else { + if (PostingsCodec.DEBUG) + System.out.println(" not pulsed pass isIndex=" + pendingIndexTerm); + + postingsReader.readTerm(docFreq, pendingIndexTerm); + pendingIndexTerm = false; + } + } + + public void close() throws IOException { + postingsReader.close(); + } + + final PulsingDocsEnum docsEnum = new PulsingDocsEnum(); + + DocsEnum docs(BitVector deletedDocs) throws IOException { + if (docFreq <= 
maxPulsingDocFreq) { + docsEnum.reset(deletedDocs); + return docsEnum; + } else + return postingsReader.docs(deletedDocs); + } + + class PulsingDocsEnum extends DocsEnum { + int nextRead; + private BitVector deletedDocs; + private Document doc; + + public void close() {} + + void reset(BitVector deletedDocs) { + this.deletedDocs = deletedDocs; + nextRead = 0; + } + + public int next() { + while(true) { + if (nextRead >= docFreq) + return -1; + else { + doc = docs[nextRead++]; + if (deletedDocs == null || !deletedDocs.get(doc.docID)) + return doc.docID; + } + } + } + + public int read(int[] retDocs, int[] retFreqs) { + final int limit; + int i=0; + // nocommit -- ob1? + while(nextRead < docFreq) { + doc = docs[nextRead++]; + if (deletedDocs == null || !deletedDocs.get(doc.docID)) { + retDocs[i] = doc.docID; + if (omitTF) + retFreqs[i] = 0; + else + retFreqs[i] = doc.numPositions; + i++; + } + } + return i; + } + + public int ord() { + assert nextRead <= docFreq; + return nextRead-1; + } + + public int freq() { + return doc.numPositions; + } + + class PulsingPositionsEnum extends PositionsEnum { + int nextRead; + FormatPulsingDocsWriter.Position pos; + + void reset() { + nextRead = 0; + } + + public int next() { + assert nextRead < doc.numPositions; + pos = doc.positions[nextRead++]; + return pos.pos; + } + + public int getPayloadLength() { + return pos.payloadLength; + } + + public boolean hasPayload() { + return pos.payloadLength > 0; + } + + public byte[] getPayload(byte[] data, int offset) { + // nocommit -- inefficient + System.arraycopy(pos.payload, 0, data, offset, pos.payloadLength); + return data; + } + } + + final PulsingPositionsEnum positions = new PulsingPositionsEnum(); + + public PositionsEnum positions() throws IOException { + positions.reset(); + return positions; + } + + public int skipTo(int target) throws IOException { + int doc; + while((doc=next()) != -1) { + if (doc >= target) + return doc; + } + return -1; + } + } + } +} Index: src/java/org/apache/lucene/index/FormatPulsingDocsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FormatPulsingDocsWriter.java (revision 0) +++ src/java/org/apache/lucene/index/FormatPulsingDocsWriter.java (revision 0) @@ -0,0 +1,277 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** Consumes doc & freq, writing them using the current + * index file format */ + +import java.io.IOException; +import java.util.List; +import java.util.ArrayList; + +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.store.IndexOutput; + +final class FormatPulsingDocsWriter extends DocsConsumer { + + final static String CODEC = "PulsedPostings"; + + // Increment version to change it: + final static int VERSION_START = 0; + final static int VERSION_CURRENT = VERSION_START; + + IndexOutput termsOut; + + boolean omitTF; + boolean storePayloads; + + // Starts a new term + FieldInfo fieldInfo; + + // nocommit + String desc; + + static class Document { + int docID; + int termDocFreq; + int numPositions; + Position[] positions; + Document() { + positions = new Position[1]; + positions[0] = new Position(); + } + + void reallocPositions(int minSize) { + final Position[] newArray = new Position[ArrayUtil.getNextSize(minSize)]; + System.arraycopy(positions, 0, newArray, 0, positions.length); + for(int i=positions.length;i maxPulsingDocFreq docs + + static class Position { + byte[] payload; + int pos; + int payloadLength; + } + + // nocommit -- lazy init this? ie, if every single term + // was pulsed then we never need to use this fallback? + // Fallback writer for non-pulsed terms: + final DocsConsumer postingsDocsWriter; + + /** If docFreq <= maxPulsingDocFreq, its postings are + * inlined into terms dict */ + + FormatPulsingDocsWriter(SegmentWriteState state, int maxPulsingDocFreq, DocsConsumer postingsDocsWriter) throws IOException { + super(); + + pendingDocs = new Document[maxPulsingDocFreq]; + for(int i=0;i 0) { + if (pos.payload == null || payloadLength > pos.payload.length) + pos.payload = new byte[ArrayUtil.getNextSize(payloadLength)]; + System.arraycopy(payload, payloadOffset, pos.payload, 0, payloadLength); + pos.payloadLength = payloadLength; + } else + pos.payloadLength = 0; + } + void finishDoc() { + assert currentDoc.numPositions == currentDoc.termDocFreq; + } + void finishTerm(boolean isIndexTerm) {} + void close() {} + } + + final PositionsWriter posWriter = new PositionsWriter(); + + PositionsConsumer addDoc(int docID, int termDocFreq) throws IOException { + + assert docID >= 0: "got docID=" + docID; + + if (PostingsCodec.DEBUG) + System.out.println("PW.addDoc: docID=" + docID + " pendingDocCount=" + pendingDocCount + " vs " + pendingDocs.length + " pulsed=" + pulsed); + + if (!pulsed && pendingDocCount == pendingDocs.length) { + + // OK we just crossed the threshold, this term should + // now be "pulsed" into the main postings codec: + postingsDocsWriter.startTerm(); + if (PostingsCodec.DEBUG) + System.out.println(" now flush buffer"); + for(int i=0;i= target */ + public abstract int skipTo(int target) throws IOException; + + /** Returns the next docID, or -1 at the end. */ + public abstract int next() throws IOException; + + public abstract int freq(); + + public abstract int ord(); + + /** Bulk read: returns number of docs read. */ + public abstract int read(int[] docs, int[] freqs) throws IOException; + + // nocommit -- maybe move this up to TermsEnum? 
that + // would disallow changing positions format/reader of each + // doc, though + /** Don't call next() or skipTo() or read() until you're + * done consuming the positions */ + public abstract PositionsEnum positions() throws IOException; + + //public abstract void close() throws IOException; +} Index: src/java/org/apache/lucene/index/PulsingCodec.java =================================================================== --- src/java/org/apache/lucene/index/PulsingCodec.java (revision 0) +++ src/java/org/apache/lucene/index/PulsingCodec.java (revision 0) @@ -0,0 +1,65 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Collection; +import java.io.IOException; + +import org.apache.lucene.store.Directory; + +class PulsingCodec extends PostingsCodec { + + PulsingCodec() { + name = "Pulsing"; + } + + FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + DocsConsumer docsWriter = new FormatPostingsDocsWriter(state); + boolean success = false; + try { + DocsConsumer pulsingWriter = new FormatPulsingDocsWriter(state, 1, docsWriter); + FieldsConsumer ret = new FormatPostingsTermsDictWriter(state, pulsingWriter); + success = true; + return ret; + } finally { + if (!success) + docsWriter.close(); + } + } + + FieldsProducer fieldsProducer(Directory dir, FieldInfos fieldInfos, SegmentInfo si, int readBufferSize) throws IOException { + FormatPostingsTermsDictDocsReader docs = new FormatPostingsDocsReader(dir, si, readBufferSize); + boolean success = false; + try { + FormatPostingsTermsDictDocsReader docsReader = new FormatPulsingDocsReader(dir, si, readBufferSize, docs); + FieldsProducer ret = new FormatPostingsTermsDictReader(dir, fieldInfos, si.name, + docsReader, + readBufferSize); + success = true; + return ret; + } finally { + if (!success) + docs.close(); + } + } + + void files(SegmentInfo segmentInfo, Collection files) { + FormatPulsingDocsReader.files(segmentInfo, files); + FormatPostingsTermsDictReader.files(segmentInfo, files); + } +} Property changes on: src/java/org/apache/lucene/index/PulsingCodec.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/index/FormatPostingsFieldsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsFieldsConsumer.java (revision 747337) +++ src/java/org/apache/lucene/index/FormatPostingsFieldsConsumer.java (working copy) @@ -1,36 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -/** Abstract API that consumes terms, doc, freq, prox and - * payloads postings. Concrete implementations of this - * actually do "something" with the postings (write it into - * the index in a specific format). - * - * NOTE: this API is experimental and will likely change - */ -abstract class FormatPostingsFieldsConsumer { - - /** Add a new field */ - abstract FormatPostingsTermsConsumer addField(FieldInfo field) throws IOException; - - /** Called when we are done adding everything. */ - abstract void finish() throws IOException; -} Index: src/java/org/apache/lucene/index/FieldsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/FieldsConsumer.java (revision 0) +++ src/java/org/apache/lucene/index/FieldsConsumer.java (revision 0) @@ -0,0 +1,36 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +/** Abstract API that consumes terms, doc, freq, prox and + * payloads postings. Concrete implementations of this + * actually do "something" with the postings (write it into + * the index in a specific format). + * + * NOTE: this API is experimental and will likely change + */ +abstract class FieldsConsumer { + + /** Add a new field */ + abstract TermsConsumer addField(FieldInfo field) throws IOException; + + /** Called when we are done adding everything. 
*/ + abstract void close() throws IOException; +} Property changes on: src/java/org/apache/lucene/index/FieldsConsumer.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/index/FormatSepDocsReader.java =================================================================== --- src/java/org/apache/lucene/index/FormatSepDocsReader.java (revision 0) +++ src/java/org/apache/lucene/index/FormatSepDocsReader.java (revision 0) @@ -0,0 +1,442 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Collection; + +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BitVector; + +/** Concrete class that reads the current doc/freq/skip + * postings format */ + +// nocommit -- should we switch "hasProx" higher up? and +// create two separate docs readers, one that also reads +// prox and one that doesn't? 
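
Taken together with the DocsConsumer/PositionsConsumer classes elsewhere in this patch, FieldsConsumer and TermsConsumer above form the write-side chain. A hedged sketch of the call order a postings producer would follow: it assumes the index package (the consumers are package-private), assumes PositionsConsumer exposes finishDoc() as the pulsing writer's does, and uses made-up data. Per the TermsConsumer contract, the term text must end with U+FFFF.

package org.apache.lucene.index;

import java.io.IOException;

// Illustrative write-side flow: one field, one term, a few docs.
class ConsumerChainSketch {
  static void writeOneTerm(FieldsConsumer fields, FieldInfo fieldInfo,
                           char[] termText, int[] docIDs, int[] freqs)
      throws IOException {
    TermsConsumer terms = fields.addField(fieldInfo);
    DocsConsumer docs = terms.startTerm(termText, 0); // text ends with U+FFFF
    for (int i = 0; i < docIDs.length; i++) {
      PositionsConsumer positions = docs.addDoc(docIDs[i], freqs[i]);
      // one positions.addPosition(...) call per occurrence, then:
      positions.finishDoc();                          // assumed API
    }
    terms.finishTerm(termText, 0, docIDs.length);
    terms.finish();
  }
}
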
+ +class FormatSepDocsReader extends FormatPostingsTermsDictDocsReader { + + final IndexInput freqIn; + final IndexInput docIn; + final IndexInput skipIn; + + IndexInput termsIn; + + private final FormatSepPositionsReader posReader; + + int skipInterval; + int maxSkipLevels; + + FormatSepDocsReader(Directory dir, SegmentInfo segmentInfo, int readBufferSize) throws IOException { + + boolean success = false; + try { + freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, IndexFileNames.FREQ_EXTENSION), readBufferSize); + docIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, "doc"), readBufferSize); + skipIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, "skp"), readBufferSize); + if (segmentInfo.getHasProx()) + posReader = new FormatSepPositionsReader(dir, segmentInfo, readBufferSize); + else + posReader = null; + success = true; + } finally { + if (!success) + close(); + } + } + + static void files(SegmentInfo segmentInfo, Collection files) { + files.add(IndexFileNames.segmentFileName(segmentInfo.name, IndexFileNames.FREQ_EXTENSION)); + files.add(IndexFileNames.segmentFileName(segmentInfo.name, "doc")); + files.add(IndexFileNames.segmentFileName(segmentInfo.name, "skp")); + FormatSepPositionsReader.files(segmentInfo, files); + } + + void start(IndexInput termsIn) throws IOException { + this.termsIn = termsIn; + + // Make sure we are talking to the matching past writer + PostingsCodec.checkHeader(termsIn, FormatSepDocsWriter.CODEC, FormatSepPositionsWriter.VERSION_START); + + skipInterval = termsIn.readInt(); + maxSkipLevels = termsIn.readInt(); + if (posReader != null) + posReader.start(termsIn); + } + + Reader reader(FieldInfo fieldInfo, IndexInput termsIn) { + + final FormatSepPositionsReader.TermsDictReader posReader2; + if (posReader != null && !fieldInfo.omitTf) + posReader2 = (FormatSepPositionsReader.TermsDictReader) posReader.reader(fieldInfo, termsIn); + else + posReader2 = null; + + return new TermsDictReader(fieldInfo, posReader2, termsIn); + } + + void close() throws IOException { + try { + if (freqIn != null) + freqIn.close(); + } finally { + try { + if (docIn != null) + docIn.close(); + } finally { + try { + if (skipIn != null) + skipIn.close(); + } finally { + if (posReader != null) + posReader.close(); + } + } + } + } + + class TermsDictReader extends Reader { + + final IndexInput termsIn; + final FieldInfo fieldInfo; + long freqOffset; + long docOffset; + long skipOffset; + int docFreq; + + // TODO: abstraction violation (we are storing this with + // the concrete impl. 
as the type, not the abstract base + // class) + final FormatSepPositionsReader.TermsDictReader posReader; + private SegmentDocsEnum docs; + + TermsDictReader(FieldInfo fieldInfo, FormatSepPositionsReader.TermsDictReader posReader, IndexInput termsIn) { + this.termsIn = termsIn; // not cloned + this.fieldInfo = fieldInfo; + this.posReader = posReader; + } + + void readTerm(int docFreq, boolean isIndexTerm) throws IOException { + + this.docFreq = docFreq; + if (PostingsCodec.DEBUG) { + System.out.println(" dr.readTerm termsFP=" + termsIn.getFilePointer() + " df=" + docFreq + " isIndex=" + isIndexTerm); + System.out.println(" start freqFP=" + freqOffset + " docFP=" + docOffset + " skipFP=" + skipOffset); + } + + if (isIndexTerm) { + freqOffset = termsIn.readVLong(); + docOffset = termsIn.readVLong(); + skipOffset = termsIn.readVLong(); + } else { + freqOffset += termsIn.readVLong(); + docOffset += termsIn.readVLong(); + if (docFreq >= skipInterval) + skipOffset += termsIn.readVLong(); + } + + if (PostingsCodec.DEBUG) + System.out.println(" freqFP=" + freqOffset + " docFP=" + docOffset + " skipFP=" + skipOffset); + + if (posReader != null) + posReader.readTerm(docFreq, isIndexTerm); + } + + public void close() throws IOException { + if (posReader != null) + posReader.close(); + } + + DocsEnum docs(BitVector deletedDocs) throws IOException { + + if (docs == null) + // Lazy init + docs = new SegmentDocsEnum(); + + docs.init(deletedDocs); + + return docs; + } + + class SegmentDocsEnum extends DocsEnum { + int docFreq; + int doc; + int count; + int freq; + long freqStart; + final IndexInput freqIn; + final IndexInput docIn; + + // nocommit -- should we do omitTF with 2 different enum classes? + final boolean omitTF; + private BitVector deletedDocs; + + // nocommit -- should we do hasProx with 2 different enum classes? + + boolean skipped; + SepSkipListReader skipper; + + // TODO: abstraction violation: we are storing the + // concrete impl, not the abstract base class + FormatSepPositionsReader.TermsDictReader.SegmentPositionsEnum positions; + + SegmentDocsEnum() { + if (PostingsCodec.DEBUG) + System.out.println("new docs enum"); + + this.docIn = (IndexInput) FormatSepDocsReader.this.docIn.clone(); + omitTF = fieldInfo.omitTf; + if (!omitTF) + this.freqIn = (IndexInput) FormatSepDocsReader.this.freqIn.clone(); + else { + this.freqIn = null; + freq = 1; + } + } + + public void close() {} + + void init(BitVector deletedDocs) throws IOException { + if (PostingsCodec.DEBUG) + System.out.println("[" + desc + "] dr.init freqIn seek " + freqOffset + " this=" + this + " (in=" + freqIn + "; this=" + this + ")"); + this.deletedDocs = deletedDocs; + docIn.seek(docOffset); + if (!omitTF) + freqIn.seek(freqOffset); + this.docFreq = TermsDictReader.this.docFreq; + count = 0; + doc = 0; + skipped = false; + proxSkipFreq = 0; + + // maybe not necessary? 
+ proxSkipPayloadLength = -1; + + // TODO: abstraction violation + if (posReader != null) { + posOffset = posReader.posOffset; + payloadOffset = posReader.payloadOffset; + } + } + + public int next() throws IOException { + + if (PostingsCodec.DEBUG) { + + if (!omitTF) + System.out.println("dr [" + desc + "] next count=" + count + " vs df=" + docFreq + " freqFP=" + freqIn.getFilePointer() + " docFP=" + docIn.getFilePointer() + " deletes?=" + (deletedDocs != null) ); + else + System.out.println("dr [" + desc + "] next count=" + count + " vs df=" + docFreq + " docFP=" + docIn.getFilePointer() + " deletes?=" + (deletedDocs != null) ); + } + + // new Throwable().printStackTrace(System.out); + + while(true) { + if (count == docFreq) + return -1; + + count++; + + // Decode next doc + doc += docIn.readVInt(); + + if (!omitTF) { + freq = freqIn.readVInt(); + if (positions != null) + positions.skip(freq); + else + proxSkipFreq += freq; + } + + if (deletedDocs == null || !deletedDocs.get(doc)) + break; + else if (PostingsCodec.DEBUG) + System.out.println(" doc=" + doc + " is deleted"); + } + + // nocommit + if (PostingsCodec.DEBUG) { + if (positions != null) + positions.desc = desc + ":" + doc; + System.out.println(" return doc=" + doc); + } + return doc; + } + + public int read(int[] docs, int[] freqs) throws IOException { + int i = 0; + final int length = docs.length; + while (i < length && count < docFreq) { + count++; + // manually inlined call to next() for speed + doc += docIn.readVInt(); + if (!omitTF) { + freq = freqIn.readVInt(); + if (positions != null) + positions.skip(freq); + else + proxSkipFreq += freq; + } + + if (deletedDocs == null || !deletedDocs.get(doc)) { + docs[i] = doc; + freqs[i] = freq; + i++; + } + } + + return i; + } + + public int ord() { + assert count > 0; + return count-1; + } + + public int freq() { + return freq; + } + + long posOffset; + long payloadOffset; + int proxSkipPayloadLength = -1; + int proxSkipFreq; + PositionsEnum fakePositions; + + public PositionsEnum positions() throws IOException { + if (positions == null) { + // Lazy init + if (posReader == null) { + // TermFreq was omitted from this field during + // indexing, which means we pretend termFreq is + // always 1 with that 1 occurrence having + // position 0 + if (fakePositions == null) + fakePositions = new FakePositionsEnum(); + return fakePositions; + } else { + // TODO: abstraction violation + positions = (FormatSepPositionsReader.TermsDictReader.SegmentPositionsEnum) posReader.positions(); + if (PostingsCodec.DEBUG) + System.out.println("pos skip posOffset=" + posOffset + " payloadlen=" + proxSkipPayloadLength + " skipPosCount= " + proxSkipFreq); + positions.skip(posOffset, payloadOffset, proxSkipPayloadLength, proxSkipFreq); + } + } + + if (PostingsCodec.DEBUG) + positions.desc = desc + ":" + doc; + + positions.catchUp(freq); + + return positions; + } + + public int skipTo(int target) throws IOException { + + // TODO: jump right to next() if target is < X away + // from where we are now? 
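+      // Back-of-the-envelope sketch, assuming the default skipInterval of
+      // 16 (see the old TermInfosWriter below): a term with docFreq < 16
+      // writes no skip data at all, so this method reduces to the linear
+      // next() loop at the bottom; at docFreq = 100 the skipper records
+      // every 16th doc, so after skipper.skipTo(target) at most 15 docs
+      // remain to be scanned linearly.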
+ + if (PostingsCodec.DEBUG) + System.out.println("dr [" + desc + "]: skip to target=" + target); + + if (docFreq >= skipInterval) { + + // There are enough docs in the posting to have + // skip data + if (skipper == null) + // Lazy init + skipper = new SepSkipListReader((IndexInput) skipIn.clone(), maxSkipLevels, skipInterval); + + if (!skipped) { + + // We haven't already skipped for this posting, + // so now we init the skipper + + // TODO: this is abstraction violation; instead, + // skipper should interact with this as a + // private consumer + skipper.init(skipOffset, + docOffset, freqOffset, posOffset, payloadOffset, + docFreq, fieldInfo.storePayloads); + + if (PostingsCodec.DEBUG) + System.out.println(" skip reader base skipFP=" + skipOffset + " docFP=" + docOffset + " freqFP=" + freqOffset + " proxFP=" + posOffset + " payloadFP=" + payloadOffset); + + skipped = true; + } + + final int newCount = skipper.skipTo(target); + + if (newCount > count) { + + if (PostingsCodec.DEBUG) + System.out.println("dr [" + desc + "]: skipper moved to newCount=" + newCount + " docFP=" + skipper.getDocPointer() + " freqFP=" + skipper.getFreqPointer() + " posFP=" + skipper.getPosPointer() + " payloadFP=" + skipper.getPayloadPointer() + " doc=" + skipper.getDoc()); + + // Skipper did move + if (!omitTF) + freqIn.seek(skipper.getFreqPointer()); + docIn.seek(skipper.getDocPointer()); + count = newCount; + doc = skipper.getDoc(); + + // TODO: abstraction violation; this should be a + // private interaction b/w skipper & posReader + if (positions != null) + // nocommit -- should that be count? + positions.skip(skipper.getPosPointer(), skipper.getPayloadPointer(), skipper.getPayloadLength(), 0); + else { + posOffset = skipper.getPosPointer(); + payloadOffset = skipper.getPayloadPointer(); + proxSkipPayloadLength = skipper.getPayloadLength(); + // nocommit -- should that be count? + proxSkipFreq = 0; + } + } else if (PostingsCodec.DEBUG) + System.out.println(" no skipping to be done"); + } + + // Now, linear scan for the rest: + do { + if (next() == -1) + return -1; + } while (target > doc); + + return doc; + } + } + } +} + +/** Returned when someone asks for positions() enum on field + * with omitTf true */ +class FakePositionsEnum extends PositionsEnum { + int next() { + return 0; + } + int getPayloadLength() { + return 0; + } + boolean hasPayload() { + return false; + } + byte[] getPayload(byte[] data, int offset) { + return null; + } + } Property changes on: src/java/org/apache/lucene/index/FormatSepDocsReader.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/index/SegmentMerger.java =================================================================== --- src/java/org/apache/lucene/index/SegmentMerger.java (revision 747337) +++ src/java/org/apache/lucene/index/SegmentMerger.java (working copy) @@ -30,6 +30,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; /** * The SegmentMerger class combines two or more Segments, represented by an IndexReader ({@link #add}, @@ -68,6 +69,9 @@ when merging stored fields */ private final static int MAX_RAW_MERGE_DOCS = 4192; + private final PostingsCodecs codecs; + private PostingsCodec codec; + /** This ctor used only by test code. 
* * @param dir The Directory to merge the other segments into @@ -76,10 +80,12 @@ SegmentMerger(Directory dir, String name) { directory = dir; segment = name; + codecs = PostingsCodecs.getDefault(); } - SegmentMerger(IndexWriter writer, String name, MergePolicy.OneMerge merge) { + SegmentMerger(IndexWriter writer, String name, MergePolicy.OneMerge merge, PostingsCodecs codecs) { directory = writer.getDirectory(); + this.codecs = codecs; segment = name; if (merge != null) checkAbort = new CheckAbort(merge, directory); @@ -159,26 +165,37 @@ } } - final List createCompoundFile(String fileName) + final List createCompoundFile(String fileName) throws IOException { + // nocommit -- messy! + final SegmentWriteState state = new SegmentWriteState(null, directory, segment, fieldInfos, null, mergedDocs, 0, 0, PostingsCodecs.getDefault()); + return createCompoundFile(fileName, new SegmentInfo(segment, mergedDocs, directory, + PostingsCodecs.getDefault().getWriter(state))); + } + + final List createCompoundFile(String fileName, final SegmentInfo info) throws IOException { CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, fileName, checkAbort); - List files = - new ArrayList(IndexFileNames.COMPOUND_EXTENSIONS.length + 1); - + List files = new ArrayList(IndexFileNames.COMPOUND_EXTENSIONS.length + 1); + // Basic files - for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS.length; i++) { - String ext = IndexFileNames.COMPOUND_EXTENSIONS[i]; + for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS_NOT_CODEC.length; i++) { + String ext = IndexFileNames.COMPOUND_EXTENSIONS_NOT_CODEC[i]; + // nocommit + /* if (ext.equals(IndexFileNames.PROX_EXTENSION) && !hasProx()) continue; + */ if (mergeDocStores || (!ext.equals(IndexFileNames.FIELDS_EXTENSION) && !ext.equals(IndexFileNames.FIELDS_INDEX_EXTENSION))) files.add(segment + "." 
+ ext); } + codec.files(info, files); + // Fieldable norm files for (int i = 0; i < fieldInfos.size(); i++) { FieldInfo fi = fieldInfos.fieldInfo(i); @@ -477,13 +494,21 @@ throw new RuntimeException("mergeVectors produced an invalid result: mergedDocs is " + mergedDocs + " but tvx size is " + tvxSize + "; now aborting this merge to prevent index corruption"); } - private SegmentMergeQueue queue = null; + private SegmentMergeQueue queue; + PostingsCodec getCodec() { + return codec; + } + private final void mergeTerms() throws CorruptIndexException, IOException { - SegmentWriteState state = new SegmentWriteState(null, directory, segment, null, mergedDocs, 0, termIndexInterval); + SegmentWriteState state = new SegmentWriteState(null, directory, segment, fieldInfos, null, mergedDocs, 0, termIndexInterval, codecs); - final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos); + // Let Codecs decide which codec will be used to write + // this segment: + codec = codecs.getWriter(state); + + final FieldsConsumer consumer = codec.fieldsConsumer(state); try { queue = new SegmentMergeQueue(readers.size()); @@ -491,14 +516,14 @@ mergeTermInfos(consumer); } finally { - consumer.finish(); + consumer.close(); if (queue != null) queue.close(); } } boolean omitTF; - private final void mergeTermInfos(final FormatPostingsFieldsConsumer consumer) throws CorruptIndexException, IOException { + private final void mergeTermInfos(final FieldsConsumer consumer) throws CorruptIndexException, IOException { int base = 0; final int readerCount = readers.size(); for (int i = 0; i < readerCount; i++) { @@ -525,7 +550,7 @@ SegmentMergeInfo[] match = new SegmentMergeInfo[readers.size()]; String currentField = null; - FormatPostingsTermsConsumer termsConsumer = null; + TermsConsumer termsConsumer = null; while (queue.size() > 0) { int matchSize = 0; // pop matching terms @@ -546,6 +571,9 @@ termsConsumer = consumer.addField(fieldInfo); omitTF = fieldInfo.omitTf; } + + if (PostingsCodec.DEBUG) + System.out.println("merge term=" + term); int df = appendPostings(termsConsumer, match, matchSize); // add new TermInfo @@ -572,6 +600,8 @@ return delCounts; } + private char[] termBuffer; + /** Process postings from multiple segments all positioned on the * same term. Writes out merged entries into freqOutput and * the proxOutput streams. 
@@ -582,10 +612,17 @@ * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ - private final int appendPostings(final FormatPostingsTermsConsumer termsConsumer, SegmentMergeInfo[] smis, int n) + private final int appendPostings(final TermsConsumer termsConsumer, SegmentMergeInfo[] smis, int n) throws CorruptIndexException, IOException { - final FormatPostingsDocsConsumer docConsumer = termsConsumer.addTerm(smis[0].term.text); + final String text = smis[0].term.text; + final int len = text.length(); + if (termBuffer == null || termBuffer.length < 1+len) + termBuffer = new char[ArrayUtil.getNextSize(1+len)]; + text.getChars(0, len, termBuffer, 0); + termBuffer[len] = 0xffff; + + final DocsConsumer docConsumer = termsConsumer.startTerm(termBuffer, 0); int df = 0; for (int i = 0; i < n; i++) { SegmentMergeInfo smi = smis[i]; @@ -598,13 +635,18 @@ while (postings.next()) { df++; int doc = postings.doc(); - if (docMap != null) + if (docMap != null) { doc = docMap[doc]; // map around deletions + assert doc != -1: "postings enum returned deleted docID " + postings.doc() + " freq=" + postings.freq() + " df=" + df; + } doc += base; // convert to merged space final int freq = postings.freq(); - final FormatPostingsPositionsConsumer posConsumer = docConsumer.addDoc(doc, freq); + final PositionsConsumer posConsumer = docConsumer.addDoc(doc, freq); + // nocommit -- omitTF should be "private", and this + // code (and FreqProxTermsWriter) should instead + // check if posConsumer is null? if (!omitTF) { for (int j = 0; j < freq; j++) { final int position = postings.nextPosition(); @@ -616,12 +658,13 @@ } posConsumer.addPosition(position, payloadBuffer, 0, payloadLength); } - posConsumer.finish(); + posConsumer.finishDoc(); } } } - docConsumer.finish(); + termsConsumer.finishTerm(termBuffer, 0, df); + return df; } Index: src/java/org/apache/lucene/index/FormatSepDocsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FormatSepDocsWriter.java (revision 0) +++ src/java/org/apache/lucene/index/FormatSepDocsWriter.java (revision 0) @@ -0,0 +1,231 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 */ + */ + +import java.io.IOException; + +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.store.IndexOutput; + +/** Writes frq to .frq, docs to .doc, pos to .pos, payloads + * to .pyl, and skip data to .skp, with skip pointers inlined + * into the terms dict */ + +final class FormatSepDocsWriter extends DocsConsumer { + final static String CODEC = "SepDocFreqSkip"; + + // Increment version to change it: + final static int VERSION_START = 0; + final static int VERSION_CURRENT = VERSION_START; + + final IndexOutput freqOut; + final IndexOutput docOut; + final IndexOutput skipOut; + IndexOutput termsOut; + + final FormatSepPositionsWriter posWriter; + final SepSkipListWriter skipListWriter; + final int skipInterval; + final int maxSkipLevels; + final int totalNumDocs; + + boolean omitTF; + boolean storePayloads; + + // Per-term state: file pointers where the current and + // previous terms' postings start + long lastFreqStart; + long freqStart; + long lastDocStart; + long docStart; + long lastSkipStart; + + FieldInfo fieldInfo; + + FormatSepDocsWriter(SegmentWriteState state) throws IOException { + super(); + + final String frqFileName = IndexFileNames.segmentFileName(state.segmentName, "frq"); + state.flushedFiles.add(frqFileName); + freqOut = state.directory.createOutput(frqFileName); + + final String docFileName = IndexFileNames.segmentFileName(state.segmentName, "doc"); + state.flushedFiles.add(docFileName); + docOut = state.directory.createOutput(docFileName); + + final String skipFileName = IndexFileNames.segmentFileName(state.segmentName, "skp"); + state.flushedFiles.add(skipFileName); + skipOut = state.directory.createOutput(skipFileName); + + if (PostingsCodec.DEBUG) + System.out.println("dw.init: create frq=" + frqFileName + " doc=" + docFileName + " skip=" + skipFileName); + + totalNumDocs = state.numDocs; + + // nocommit -- abstraction violation + skipListWriter = new SepSkipListWriter(state.skipInterval, + state.maxSkipLevels, + state.numDocs, + freqOut, docOut, + null, null); + + skipInterval = state.skipInterval; + maxSkipLevels = state.maxSkipLevels; + + posWriter = new FormatSepPositionsWriter(state, this); + } + + void start(IndexOutput termsOut) throws IOException { + this.termsOut = termsOut; + PostingsCodec.writeHeader(termsOut, CODEC, VERSION_CURRENT); + // nocommit -- just ask skipper to "start" here + termsOut.writeInt(skipInterval); // write skipInterval + termsOut.writeInt(maxSkipLevels); // write maxSkipLevels + posWriter.start(termsOut); + } + + void startTerm() { + freqStart = freqOut.getFilePointer(); + docStart = docOut.getFilePointer(); + if (!omitTF) + posWriter.startTerm(); + skipListWriter.resetSkip(); + } + + // nocommit -- should we NOT reuse across fields? would + // be cleaner + + // Currently, this instance is re-used across fields, so + // our parent calls setField whenever the field changes + void setField(FieldInfo fieldInfo) { + this.fieldInfo = fieldInfo; + omitTF = fieldInfo.omitTf; + storePayloads = fieldInfo.storePayloads; + posWriter.setField(fieldInfo); + } + + int lastDocID; + int df; + + int count; + + /** Adds a new doc in this term. If this returns null + * then we just skip consuming positions/payloads.
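+   *
+   * A hypothetical caller (a sketch only; the positions array and
+   * payload values are made up) would drive it the way the merge
+   * loop in SegmentMerger does:
+   *
+   *   PositionsConsumer pos = docConsumer.addDoc(docID, freq);
+   *   if (pos != null) {
+   *     for(int i=0;i<freq;i++)
+   *       pos.addPosition(positions[i], payloadBuffer, 0, payloadLength);
+   *     pos.finishDoc();
+   *   }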
*/ + PositionsConsumer addDoc(int docID, int termDocFreq) throws IOException { + + final int delta = docID - lastDocID; + + if (PostingsCodec.DEBUG) + System.out.println(" dw.addDoc [" + desc + "] count=" + (count++) + " docID=" + docID + " lastDocID=" + lastDocID + " delta=" + delta + " omitTF=" + omitTF + " freq=" + termDocFreq + " freqPointer=" + freqOut.getFilePointer()); + + if (docID < 0 || (df > 0 && delta <= 0)) + throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )"); + + if ((++df % skipInterval) == 0) { + // TODO: abstraction violation + // nocommit -- awkward we have to make these two + // separate calls to skipper + skipListWriter.setSkipData(lastDocID, storePayloads, posWriter.lastPayloadLength); + skipListWriter.bufferSkip(df); + if (PostingsCodec.DEBUG) + System.out.println(" bufferSkip lastDocID=" + lastDocID + " df=" + df + " freqFP=" + freqOut.getFilePointer() + " docFP=" + docOut.getFilePointer() + " posFP=" + skipListWriter.posOutput.getFilePointer() + " payloadFP=" + skipListWriter.payloadOutput.getFilePointer() + " payloadLen=" + posWriter.lastPayloadLength); + } + + // nocommit -- move this assert up above; every consumer + // shouldn't have to check for this bug: + assert docID < totalNumDocs: "docID=" + docID + " totalNumDocs=" + totalNumDocs; + + lastDocID = docID; + docOut.writeVInt(delta); + if (!omitTF) + freqOut.writeVInt(termDocFreq); + + // nocommit + if (PostingsCodec.DEBUG) + ((FormatSepPositionsWriter) posWriter).desc = desc + ":" + docID; + + if (omitTF) + return null; + else + return posWriter; + } + + /** Called when we are done adding docs to this term */ + void finishTerm(int docCount, boolean isIndexTerm) throws IOException { + + long skipPos = skipOut.getFilePointer(); + + // nocommit -- wasteful we are counting this in two places? 
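+    // Sketch of the encoding below, with made-up numbers: at an index
+    // term the absolute freqStart/docStart/skip pointers are written, so
+    // a reader can seek straight to them; at all other terms only deltas
+    // are written. E.g. if the previous term's freqStart was 1000 and
+    // this term's is 1040, a non-index term stores writeVLong(40).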
+ assert docCount == df; + if (PostingsCodec.DEBUG) + System.out.println("dw.finishTerm termsFP=" + termsOut.getFilePointer() + " freqStart=" + freqStart + " df=" + df + " skipPos=" + skipPos); + + if (isIndexTerm) { + // Write absolute at seek points + termsOut.writeVLong(freqStart); + termsOut.writeVLong(docStart); + } else { + // Write delta between seek points + termsOut.writeVLong(freqStart - lastFreqStart); + termsOut.writeVLong(docStart - lastDocStart); + } + + if (df >= skipInterval) { + if (PostingsCodec.DEBUG) + System.out.println(" writeSkip @ docFp=" + docOut.getFilePointer() + " freqFP=" + freqOut.getFilePointer() + " freqStartFP=" + freqStart + " skipPos=" + skipPos + " lastSkipPos=" + lastSkipStart); + + skipListWriter.writeSkip(skipOut); + } + + if (isIndexTerm) { + termsOut.writeVLong(skipPos); + lastSkipStart = skipPos; + } else if (df >= skipInterval) { + termsOut.writeVLong(skipPos-lastSkipStart); + lastSkipStart = skipPos; + } + + lastFreqStart = freqStart; + lastDocStart = docStart; + + if (!omitTF) + posWriter.finishTerm(isIndexTerm); + + lastDocID = 0; + df = 0; + + // nocommit + count = 0; + } + + void close() throws IOException { + if (PostingsCodec.DEBUG) + System.out.println("dw.close freqFP=" + freqOut.getFilePointer() + " docFP=" + docOut.getFilePointer() + " skipFP=" + skipOut.getFilePointer()); + try { + freqOut.close(); + } finally { + try { + docOut.close(); + } finally { + try { + skipOut.close(); + } finally { + posWriter.close(); + } + } + } + } +} Property changes on: src/java/org/apache/lucene/index/FormatSepDocsWriter.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/index/FreqProxTermsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FreqProxTermsWriter.java (revision 747337) +++ src/java/org/apache/lucene/index/FreqProxTermsWriter.java (working copy) @@ -88,21 +88,23 @@ } } + final int numAllFields = allFields.size(); + // Sort by field name Collections.sort(allFields); - final int numAllFields = allFields.size(); - // TODO: allow Lucene user to customize this consumer: - final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos); + // TODO: allow Lucene user to customize this codec: + final FieldsConsumer consumer = state.codec.fieldsConsumer(state); + /* Current writer chain: - FormatPostingsFieldsConsumer - -> IMPL: FormatPostingsFieldsWriter - -> FormatPostingsTermsConsumer - -> IMPL: FormatPostingsTermsWriter - -> FormatPostingsDocConsumer - -> IMPL: FormatPostingsDocWriter - -> FormatPostingsPositionsConsumer + FieldsConsumer + -> IMPL: FormatPostingsTermsDictWriter + -> TermsConsumer + -> IMPL: FormatPostingsTermsDictWriter.TermsWriter + -> DocsConsumer + -> IMPL: FormatPostingsDocsWriter + -> PositionsConsumer -> IMPL: FormatPostingsPositionsWriter */ @@ -145,8 +147,7 @@ FreqProxTermsWriterPerThread perThread = (FreqProxTermsWriterPerThread) entry.getKey(); perThread.termsHashPerThread.reset(true); } - - consumer.finish(); + consumer.close(); } private byte[] payloadBuffer; @@ -155,7 +156,7 @@ * instances) found in this field and serialize them * into a single RAM segment. 
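 *
 * Roughly, the call sequence (using the consumer chain named above) is:
 * consumer.addField(fieldInfo) yields a TermsConsumer; per term,
 * termsConsumer.startTerm(text, offset) yields a DocsConsumer; per doc,
 * docConsumer.addDoc(docID, freq) yields a PositionsConsumer, which takes
 * addPosition(...) calls followed by finishDoc(); finally
 * termsConsumer.finishTerm(text, offset, numDocs) closes out the term.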
*/ void appendPostings(FreqProxTermsWriterPerField[] fields, - FormatPostingsFieldsConsumer consumer) + FieldsConsumer consumer) throws CorruptIndexException, IOException { int numFields = fields.length; @@ -172,7 +173,7 @@ assert result; } - final FormatPostingsTermsConsumer termsConsumer = consumer.addField(fields[0].fieldInfo); + final TermsConsumer termsConsumer = consumer.addField(fields[0].fieldInfo); FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields]; @@ -196,11 +197,15 @@ termStates[numToMerge++] = mergeStates[i]; } - final FormatPostingsDocsConsumer docConsumer = termsConsumer.addTerm(termStates[0].text, termStates[0].textOffset); + final char[] termText = termStates[0].text; + final int termTextOffset = termStates[0].textOffset; + final DocsConsumer docConsumer = termsConsumer.startTerm(termText, termTextOffset); + // Now termStates has numToMerge FieldMergeStates // which all share the same term. Now we must // interleave the docID streams. + int numDocs = 0; while(numToMerge > 0) { FreqProxFieldMergeState minState = termStates[0]; @@ -209,8 +214,9 @@ minState = termStates[i]; final int termDocFreq = minState.termFreq; + numDocs++; - final FormatPostingsPositionsConsumer posConsumer = docConsumer.addDoc(minState.docID, termDocFreq); + final PositionsConsumer posConsumer = docConsumer.addDoc(minState.docID, termDocFreq); final ByteSliceReader prox = minState.prox; @@ -241,7 +247,7 @@ posConsumer.addPosition(position, payloadBuffer, 0, payloadLength); } //End for - posConsumer.finish(); + posConsumer.finishDoc(); } if (!minState.nextDoc()) { @@ -269,7 +275,7 @@ } } - docConsumer.finish(); + termsConsumer.finishTerm(termText, termTextOffset, numDocs); } termsConsumer.finish(); Index: src/java/org/apache/lucene/index/TermsEnum.java =================================================================== --- src/java/org/apache/lucene/index/TermsEnum.java (revision 0) +++ src/java/org/apache/lucene/index/TermsEnum.java (revision 0) @@ -0,0 +1,62 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.util.AttributeSource; + +/** + * NOTE: this API is experimental and will likely change + */ + +public abstract class TermsEnum extends AttributeSource { + + // nocommit -- char[] or byte[] version? + /** Seeks to the specified term. Returns true if the term + * exists. */ + public abstract boolean seek(String text) throws IOException; + + // nocommit + // abstract boolean seek(int ord) throws IOException; + + // nocommit -- String or null? + /** Increments the enumeration to the next element. True if one exists.*/ + public abstract boolean next() throws IOException; + + // nocommit -- char[] or byte[] version? 
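+  // Usage sketch (the field and term strings are made-up examples); this
+  // mirrors how IndexReader.termDocsEnum walks the new enums:
+  //
+  //   FieldsEnum fields = reader.fields();
+  //   if (fields.seek("body")) {
+  //     TermsEnum terms = fields.terms();
+  //     if (terms.seek("lucene")) {
+  //       DocsEnum docs = terms.docs();
+  //       int doc;
+  //       while((doc = docs.next()) != -1) {
+  //         final int freq = docs.freq();
+  //       }
+  //     }
+  //   }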
+ /** Returns the text for the current Term in the enumeration.*/ + public abstract String text(); + + /** Returns the docFreq of the current Term in the enumeration.*/ + public abstract int docFreq(); + + /** Not all impls will implement this, e.g. Multi*Reader + * will not */ + public abstract long ord(); + + /** Get DocsEnum for the current term. You should not + * call {@link #next()} or {@link #seek(String)} until you're + * done using the DocsEnum. */ + public abstract DocsEnum docs() throws IOException; + + // nocommit -- maybe no close method? + /** Closes the enumeration to further activity, freeing resources. */ + // public abstract void close() throws IOException; +} + Index: src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java (revision 747337) +++ src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java (working copy) @@ -1,71 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -final class FormatPostingsTermsWriter extends FormatPostingsTermsConsumer { - - final FormatPostingsFieldsWriter parent; - final FormatPostingsDocsWriter docsWriter; - final TermInfosWriter termsOut; - FieldInfo fieldInfo; - - FormatPostingsTermsWriter(SegmentWriteState state, FormatPostingsFieldsWriter parent) throws IOException { - super(); - this.parent = parent; - termsOut = parent.termsOut; - docsWriter = new FormatPostingsDocsWriter(state, this); - } - - void setField(FieldInfo fieldInfo) { - this.fieldInfo = fieldInfo; - docsWriter.setField(fieldInfo); - } - - char[] currentTerm; - int currentTermStart; - - long freqStart; - long proxStart; - - /** Adds a new term in this field */ - FormatPostingsDocsConsumer addTerm(char[] text, int start) { - currentTerm = text; - currentTermStart = start; - - // TODO: this is abstraction violation -- ideally this - // terms writer is not so "invasive", looking for file - // pointers in its child consumers.
- freqStart = docsWriter.out.getFilePointer(); - if (docsWriter.posWriter.out != null) - proxStart = docsWriter.posWriter.out.getFilePointer(); - - parent.skipListWriter.resetSkip(); - - return docsWriter; - } - - /** Called when we are done adding terms to this field */ - void finish() { - } - - void close() throws IOException { - docsWriter.close(); - } -} Index: src/java/org/apache/lucene/index/SegmentMergeInfo.java =================================================================== --- src/java/org/apache/lucene/index/SegmentMergeInfo.java (revision 747337) +++ src/java/org/apache/lucene/index/SegmentMergeInfo.java (working copy) @@ -27,12 +27,19 @@ private TermPositions postings; // use getPositions() private int[] docMap; // use getDocMap() + // nocommit + private String segment; + SegmentMergeInfo(int b, TermEnum te, IndexReader r) throws IOException { base = b; reader = r; termEnum = te; + //segment = ((SegmentReader) r).segment; + // nocommit -- this is always null (te.next() isn't yet called) term = te.term(); + if (PostingsCodec.DEBUG) + System.out.println("smi create seg=" + segment); } // maps around deleted docs @@ -64,8 +71,12 @@ final boolean next() throws IOException { if (termEnum.next()) { term = termEnum.term(); + if (PostingsCodec.DEBUG) + System.out.println(" smi.next: term=" + term + " seg=" + segment); return true; } else { + if (PostingsCodec.DEBUG) + System.out.println(" smi.next: term=null seg=" + segment); term = null; return false; } Index: src/java/org/apache/lucene/index/PreFlexCodec.java =================================================================== --- src/java/org/apache/lucene/index/PreFlexCodec.java (revision 0) +++ src/java/org/apache/lucene/index/PreFlexCodec.java (revision 0) @@ -0,0 +1,45 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Collection; +import java.io.IOException; + +import org.apache.lucene.store.Directory; + +/** Codec that reads the pre-flex-indexing postings + * format. It does not provide a writer because newly + * written segments should use DefaultCodec. 
*/ +class PreFlexCodec extends PostingsCodec { + + PreFlexCodec() { + name = "PreFlex"; + } + + FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + throw new IllegalArgumentException("this codec can only be used for reading"); + } + + FieldsProducer fieldsProducer(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize) throws IOException { + return new PreFlexTermInfosReader(dir, fieldInfos, info, readBufferSize); + } + + void files(SegmentInfo info, Collection files) { + PreFlexTermInfosReader.files(info, files); + } +} Property changes on: src/java/org/apache/lucene/index/PreFlexCodec.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/index/FieldsProducer.java =================================================================== --- src/java/org/apache/lucene/index/FieldsProducer.java (revision 0) +++ src/java/org/apache/lucene/index/FieldsProducer.java (revision 0) @@ -0,0 +1,34 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import org.apache.lucene.util.BitVector; + +/** Abstract API that provides terms, doc, freq, prox and + * payloads postings. Concrete implementations of this + * actually do "something" to read the postings from some + * store. + * + * NOTE: this API is experimental and will likely change + */ + +abstract class FieldsProducer { + abstract FieldsEnum fields(BitVector deletedDocs) throws IOException; + abstract void close() throws IOException; +} Index: src/java/org/apache/lucene/index/TermPositions.java =================================================================== --- src/java/org/apache/lucene/index/TermPositions.java (revision 747337) +++ src/java/org/apache/lucene/index/TermPositions.java (working copy) @@ -26,6 +26,7 @@ * positions of each occurrence of a term in a document. * * @see IndexReader#termPositions() + * @deprecated Use PositionsEnum instead */ public interface TermPositions Index: src/java/org/apache/lucene/index/TermInfo.java =================================================================== --- src/java/org/apache/lucene/index/TermInfo.java (revision 747337) +++ src/java/org/apache/lucene/index/TermInfo.java (working copy) @@ -17,7 +17,10 @@ * limitations under the License. */ -/** A TermInfo is the record of information stored for a term.*/ +/** A TermInfo is the record of information stored for a + * term + * @deprecated This class is no longer used in flexible + * indexing. */ final class TermInfo { /** The number of documents which contain the term. 
 */ Index: src/java/org/apache/lucene/index/PostingsCodecs.java =================================================================== --- src/java/org/apache/lucene/index/PostingsCodecs.java (revision 0) +++ src/java/org/apache/lucene/index/PostingsCodecs.java (revision 0) @@ -0,0 +1,70 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.HashMap; + +/** Holds a set of codecs, keyed by name. You subclass + * this, instantiate it, and register your codecs, then + * pass this instance to IndexReader/IndexWriter (via + * package private APIs) to use different codecs when + * reading & writing segments. */ + +abstract class PostingsCodecs { + + private final HashMap codecs = new HashMap(); + + void register(PostingsCodec codec) { + if (codec.name == null) + throw new IllegalArgumentException("codec.name is null"); + if (!codecs.containsKey(codec.name)) { + codecs.put(codec.name, codec); + } else if (codecs.get(codec.name) != codec) + throw new IllegalArgumentException("codec '" + codec.name + "' is already registered as a different codec instance"); + } + + PostingsCodec lookup(String name) { + final PostingsCodec codec = (PostingsCodec) codecs.get(name); + if (codec == null) + throw new IllegalArgumentException("required codec '" + name + "' not found"); + return codec; + } + + abstract PostingsCodec getWriter(SegmentWriteState state); + + static private final PostingsCodecs defaultCodecs = new DefaultPostingsCodecs(); + + static PostingsCodecs getDefault() { + return defaultCodecs; + } +} + +class DefaultPostingsCodecs extends PostingsCodecs { + DefaultPostingsCodecs() { + register(new DefaultCodec()); + register(new PreFlexCodec()); + register(new PulsingCodec()); + register(new SepCodec()); + } + + PostingsCodec getWriter(SegmentWriteState state) { + return lookup("Default"); + //return lookup("Pulsing"); + //return lookup("Sep"); + } +} \ No newline at end of file Property changes on: src/java/org/apache/lucene/index/PostingsCodecs.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/index/TermInfosReader.java =================================================================== --- src/java/org/apache/lucene/index/TermInfosReader.java (revision 747337) +++ src/java/org/apache/lucene/index/TermInfosReader.java (working copy) @@ -27,8 +27,9 @@ /** This stores a monotonically increasing set of pairs in a * Directory. Pairs are accessed either by Term or by ordinal position the - * set. */ - + * set + * @deprecated This class has been replaced by + * FormatPostingsTermsDictReader, except for reading old segments.
*/ final class TermInfosReader { private Directory directory; private String segment; @@ -236,7 +237,10 @@ return ti; } } - + + // nocommit -- make sure these optimizations survive + // into flex + // optimize sequential access: first try scanning cached enum w/o seeking SegmentTermEnum enumerator = resources.termEnum; if (enumerator.term() != null // term is at or past current Index: src/java/org/apache/lucene/index/FormatPostingsTermsDictPositionsReader.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsTermsDictPositionsReader.java (revision 0) +++ src/java/org/apache/lucene/index/FormatPostingsTermsDictPositionsReader.java (revision 0) @@ -0,0 +1,42 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.store.IndexInput; + +// nocommit -- bad name: this class never interacts directly +// w/ termsdict +abstract class FormatPostingsTermsDictPositionsReader { + + abstract class Reader { + abstract void readTerm(int docFreq, boolean isIndexTerm) throws IOException; + + /** Returns a pos enum for the last term read */ + abstract PositionsEnum positions() throws IOException; + + abstract void close() throws IOException; + } + + abstract void start(IndexInput termsIn) throws IOException; + + abstract Reader reader(FieldInfo fieldInfo, IndexInput termsIn) throws IOException; + + abstract void close() throws IOException; +} Property changes on: src/java/org/apache/lucene/index/FormatPostingsTermsDictPositionsReader.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/index/IndexReader.java =================================================================== --- src/java/org/apache/lucene/index/IndexReader.java (revision 747337) +++ src/java/org/apache/lucene/index/IndexReader.java (working copy) @@ -20,6 +20,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.search.Similarity; +import org.apache.lucene.util.BitVector; import org.apache.lucene.store.*; import java.io.File; @@ -203,7 +204,7 @@ * @throws IOException if there is a low-level IO error * @param path the path to the index directory */ public static IndexReader open(String path) throws CorruptIndexException, IOException { - return open(FSDirectory.getDirectory(path), true, null, null, READ_ONLY_DEFAULT); + return open(FSDirectory.getDirectory(path), true, null, null, READ_ONLY_DEFAULT, null); } /** Returns a read/write IndexReader reading the index in an FSDirectory in the named @@ -213,7 +214,7 @@ * @throws IOException if there is a low-level IO error */ public static 
IndexReader open(File path) throws CorruptIndexException, IOException { - return open(FSDirectory.getDirectory(path), true, null, null, READ_ONLY_DEFAULT); + return open(FSDirectory.getDirectory(path), true, null, null, READ_ONLY_DEFAULT, null); } /** Returns a read/write IndexReader reading the index in @@ -224,7 +225,7 @@ * @throws IOException if there is a low-level IO error */ public static IndexReader open(final Directory directory) throws CorruptIndexException, IOException { - return open(directory, false, null, null, READ_ONLY_DEFAULT); + return open(directory, false, null, null, READ_ONLY_DEFAULT, null); } /** Returns a read/write or read only IndexReader reading the index in the given Directory. @@ -234,7 +235,7 @@ * @throws IOException if there is a low-level IO error */ public static IndexReader open(final Directory directory, boolean readOnly) throws CorruptIndexException, IOException { - return open(directory, false, null, null, readOnly); + return open(directory, false, null, null, readOnly, null); } /** Expert: returns a read/write IndexReader reading the index in the given @@ -245,7 +246,7 @@ * @throws IOException if there is a low-level IO error */ public static IndexReader open(final IndexCommit commit) throws CorruptIndexException, IOException { - return open(commit.getDirectory(), false, null, commit, READ_ONLY_DEFAULT); + return open(commit.getDirectory(), false, null, commit, READ_ONLY_DEFAULT, null); } /** Expert: returns a read/write IndexReader reading the index in the given @@ -260,7 +261,7 @@ * @throws IOException if there is a low-level IO error */ public static IndexReader open(final Directory directory, IndexDeletionPolicy deletionPolicy) throws CorruptIndexException, IOException { - return open(directory, false, deletionPolicy, null, READ_ONLY_DEFAULT); + return open(directory, false, deletionPolicy, null, READ_ONLY_DEFAULT, null); } /** Expert: returns a read/write or read only IndexReader reading the index in the given @@ -276,7 +277,7 @@ * @throws IOException if there is a low-level IO error */ public static IndexReader open(final Directory directory, IndexDeletionPolicy deletionPolicy, boolean readOnly) throws CorruptIndexException, IOException { - return open(directory, false, deletionPolicy, null, readOnly); + return open(directory, false, deletionPolicy, null, readOnly, null); } /** Expert: returns a read/write IndexReader reading the index in the given @@ -293,7 +294,7 @@ * @throws IOException if there is a low-level IO error */ public static IndexReader open(final IndexCommit commit, IndexDeletionPolicy deletionPolicy) throws CorruptIndexException, IOException { - return open(commit.getDirectory(), false, deletionPolicy, commit, READ_ONLY_DEFAULT); + return open(commit.getDirectory(), false, deletionPolicy, commit, READ_ONLY_DEFAULT, null); } /** Expert: returns a read/write or read only IndexReader reading the index in the given @@ -309,11 +310,16 @@ * @throws IOException if there is a low-level IO error */ public static IndexReader open(final IndexCommit commit, IndexDeletionPolicy deletionPolicy, boolean readOnly) throws CorruptIndexException, IOException { - return open(commit.getDirectory(), false, deletionPolicy, commit, readOnly); + return open(commit.getDirectory(), false, deletionPolicy, commit, readOnly, null); } - private static IndexReader open(final Directory directory, final boolean closeDirectory, final IndexDeletionPolicy deletionPolicy, final IndexCommit commit, final boolean readOnly) throws CorruptIndexException, IOException { - 
return DirectoryIndexReader.open(directory, closeDirectory, deletionPolicy, commit, readOnly); + static IndexReader open(final Directory directory, final boolean closeDirectory, + final IndexDeletionPolicy deletionPolicy, final IndexCommit commit, + final boolean readOnly, PostingsCodecs codecs) throws CorruptIndexException, IOException { + + if (codecs == null) + codecs = PostingsCodecs.getDefault(); + return DirectoryIndexReader.open(directory, closeDirectory, deletionPolicy, commit, readOnly, codecs); } /** @@ -357,7 +363,7 @@ * @throws IOException if there is a low-level IO error */ public synchronized IndexReader reopen() throws CorruptIndexException, IOException { - throw new UnsupportedOperationException("This reader does not support reopen()."); + throw new UnsupportedOperationException("This reader does not support reopen(): " + this); } @@ -366,7 +372,7 @@ * unchanged but readOnly is different then a new reader * will be returned. */ public synchronized IndexReader reopen(boolean openReadOnly) throws CorruptIndexException, IOException { - throw new UnsupportedOperationException("This reader does not support reopen()."); + throw new UnsupportedOperationException("This reader does not support reopen(): " + this); } /** @@ -389,7 +395,7 @@ * @throws IOException if there is a low-level IO error */ public synchronized Object clone() { - throw new UnsupportedOperationException("This reader does not implement clone()"); + throw new UnsupportedOperationException("This reader does not implement clone(): " + this); } /** @@ -399,7 +405,7 @@ * @throws IOException if there is a low-level IO error */ public synchronized IndexReader clone(boolean openReadOnly) throws CorruptIndexException, IOException { - throw new UnsupportedOperationException("This reader does not implement clone()"); + throw new UnsupportedOperationException("This reader does not implement clone(): " + this); } /** @@ -414,7 +420,7 @@ if (null != directory) { return directory; } else { - throw new UnsupportedOperationException("This reader does not support this method."); + throw new UnsupportedOperationException("This reader does not support this method: " + this); } } @@ -501,7 +507,7 @@ * @throws IOException if there is a low-level IO error */ public static long getCurrentVersion(Directory directory) throws CorruptIndexException, IOException { - return SegmentInfos.readCurrentVersion(directory); + return SegmentInfos.readCurrentVersion(directory, PostingsCodecs.getDefault()); } /** @@ -519,7 +525,7 @@ * @see #getCommitUserData() */ public static String getCommitUserData(Directory directory) throws CorruptIndexException, IOException { - return SegmentInfos.readCurrentUserData(directory); + return SegmentInfos.readCurrentUserData(directory, PostingsCodecs.getDefault()); } /** @@ -527,7 +533,7 @@ * @throws UnsupportedOperationException unless overridden in subclass */ public long getVersion() { - throw new UnsupportedOperationException("This reader does not support this method."); + throw new UnsupportedOperationException("This reader does not support this method: " + this); } /** @@ -539,7 +545,7 @@ * @see #getCommitUserData(Directory) */ public String getCommitUserData() { - throw new UnsupportedOperationException("This reader does not support this method."); + throw new UnsupportedOperationException("This reader does not support this method: " + this); } /**

For IndexReader implementations that use @@ -560,7 +566,7 @@ * @throws IllegalStateException if the term index has already been loaded into memory */ public void setTermInfosIndexDivisor(int indexDivisor) throws IllegalStateException { - throw new UnsupportedOperationException("This reader does not support this method."); + throw new UnsupportedOperationException("This reader does not support this method: " + this); } /**

For IndexReader implementations that use @@ -568,7 +574,7 @@ * current indexDivisor. * @see #setTermInfosIndexDivisor */ public int getTermInfosIndexDivisor() { - throw new UnsupportedOperationException("This reader does not support this method."); + throw new UnsupportedOperationException("This reader does not support this method: " + this); } /** @@ -590,7 +596,7 @@ * @throws UnsupportedOperationException unless overridden in subclass */ public boolean isCurrent() throws CorruptIndexException, IOException { - throw new UnsupportedOperationException("This reader does not support this method."); + throw new UnsupportedOperationException("This reader does not support this method: " + this); } /** @@ -600,7 +606,7 @@ * @throws UnsupportedOperationException unless overridden in subclass */ public boolean isOptimized() { - throw new UnsupportedOperationException("This reader does not support this method."); + throw new UnsupportedOperationException("This reader does not support this method:" + this); } /** @@ -824,15 +830,22 @@ * on the resulting enumeration before calling other methods such as * {@link TermEnum#term()}. * @throws IOException if there is a low-level IO error + * @deprecated Use {@link #fields()} instead. */ public abstract TermEnum terms() throws IOException; + // Default impl emulates new API using old one + public FieldsEnum fields() throws IOException { + return new LegacyFieldsEnum(this); + } + /** Returns an enumeration of all terms starting at a given term. If * the given term does not exist, the enumeration is positioned at the * first term greater than the supplied term. The enumeration is * ordered by Term.compareTo(). Each term is greater than all that * precede it in the enumeration. * @throws IOException if there is a low-level IO error + * @deprecated nocommit */ public abstract TermEnum terms(Term t) throws IOException; @@ -853,6 +866,7 @@ *

The enumeration is ordered by document number. Each document number * is greater than all that precede it in the enumeration. * @throws IOException if there is a low-level IO error + * @deprecated Use {@link #termDocsEnum(Term)} instead. */ public TermDocs termDocs(Term term) throws IOException { ensureOpen(); @@ -861,6 +875,47 @@ return termDocs; } + private static class NullDocsEnum extends DocsEnum { + public int skipTo(int target) { + return -1; + } + public int next() { + return -1; + } + public int freq() { + return 1; + } + public int ord() { + return 0; + } + public int read(int[] docs, int[] freqs) { + return 0; + } + public PositionsEnum positions() { + return null; + } + public void close() { + } + } + private static final NullDocsEnum nullDocsEnum = new NullDocsEnum(); + + /** Returns DocsEnum for this term, or null if the field + * or term text do not exist in the index. */ + public DocsEnum termDocsEnum(Term term) throws IOException { + + // nocommit -- not good, because we don't close up the + // chain when docsEnum.close is called? + FieldsEnum fields = fields(); + if (fields.seek(term.field())) { + TermsEnum terms = fields.terms(); + if (terms.seek(term.text())) { + return terms.docs(); + } + } + + return nullDocsEnum; + } + /** Returns an unpositioned {@link TermDocs} enumerator. * @throws IOException if there is a low-level IO error */ @@ -1087,7 +1142,7 @@ * may suddenly change.

*/ public IndexCommit getIndexCommit() throws IOException { - throw new UnsupportedOperationException("This reader does not support this method."); + throw new UnsupportedOperationException("This reader does not support this method: " + this); } /** Index: src/java/org/apache/lucene/index/TermEnum.java =================================================================== --- src/java/org/apache/lucene/index/TermEnum.java (revision 747337) +++ src/java/org/apache/lucene/index/TermEnum.java (working copy) @@ -22,7 +22,8 @@ /** Abstract class for enumerating terms.

Term enumerations are always ordered by Term.compareTo(). Each term in - the enumeration is greater than all that precede it. */ + the enumeration is greater than all that precede it. +* @deprecated Use TermsEnum instead */ public abstract class TermEnum { /** Increments the enumeration to the next element. True if one exists.*/ Index: src/java/org/apache/lucene/index/DocumentsWriter.java =================================================================== --- src/java/org/apache/lucene/index/DocumentsWriter.java (revision 747337) +++ src/java/org/apache/lucene/index/DocumentsWriter.java (working copy) @@ -533,9 +533,16 @@ synchronized private void initFlushState(boolean onlyDocStore) { initSegmentName(onlyDocStore); - flushState = new SegmentWriteState(this, directory, segment, docStoreSegment, numDocsInRAM, numDocsInStore, writer.getTermIndexInterval()); + flushState = new SegmentWriteState(this, directory, segment, docFieldProcessor.fieldInfos, + docStoreSegment, numDocsInRAM, numDocsInStore, writer.getTermIndexInterval(), + writer.codecs); } + /** Returns the codec used to flush the last segment */ + PostingsCodec getCodec() { + return flushState.codec; + } + /** Flush all pending docs to a new segment */ synchronized int flush(boolean closeDocStore) throws IOException { @@ -601,8 +608,12 @@ CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, segment + "." + IndexFileNames.COMPOUND_FILE_EXTENSION); Iterator it = flushState.flushedFiles.iterator(); - while(it.hasNext()) - cfsWriter.addFile((String) it.next()); + while(it.hasNext()) { + final String fileName = (String) it.next(); + if (PostingsCodec.DEBUG) + System.out.println("make cfs " + fileName); + cfsWriter.addFile(fileName); + } // Perform the merge cfsWriter.close(); @@ -950,19 +961,19 @@ Entry entry = (Entry) iter.next(); Term term = (Term) entry.getKey(); - TermDocs docs = reader.termDocs(term); + DocsEnum docs = reader.termDocsEnum(term); if (docs != null) { int limit = ((BufferedDeletes.Num) entry.getValue()).getNum(); try { - while (docs.next()) { - int docID = docs.doc(); - if (docIDStart+docID >= limit) + while (true) { + final int docID = docs.next(); + if (docID == -1 || docIDStart+docID >= limit) break; reader.deleteDocument(docID); any = true; } } finally { - docs.close(); + //docs.close(); } } } @@ -980,14 +991,17 @@ // Delete by query IndexSearcher searcher = new IndexSearcher(reader); iter = deletesFlushed.queries.entrySet().iterator(); + //System.out.println("DW: flush delete by query"); while(iter.hasNext()) { Entry entry = (Entry) iter.next(); Query query = (Query) entry.getKey(); + //System.out.println("\n del query=" + query.toString()); int limit = ((Integer) entry.getValue()).intValue(); Weight weight = query.weight(searcher); Scorer scorer = weight.scorer(reader); while(scorer.next()) { final int docID = scorer.doc(); + //System.out.println(" del docID=" + docID); if (docIDStart + docID >= limit) break; reader.deleteDocument(docID); Index: src/java/org/apache/lucene/index/MultiSegmentReader.java =================================================================== --- src/java/org/apache/lucene/index/MultiSegmentReader.java (revision 747337) +++ src/java/org/apache/lucene/index/MultiSegmentReader.java (working copy) @@ -41,8 +41,8 @@ private boolean hasDeletions = false; /** Construct reading the named set of readers. 
*/ - MultiSegmentReader(Directory directory, SegmentInfos sis, boolean closeDirectory, boolean readOnly) throws IOException { - super(directory, sis, closeDirectory, readOnly); + MultiSegmentReader(Directory directory, SegmentInfos sis, boolean closeDirectory, boolean readOnly, PostingsCodecs codecs) throws IOException { + super(directory, sis, closeDirectory, readOnly, codecs); // To reduce the chance of hitting FileNotFound // (and having to retry), we open segments in @@ -71,8 +71,8 @@ /** This contructor is only used for {@link #reopen()} */ MultiSegmentReader(Directory directory, SegmentInfos infos, boolean closeDirectory, SegmentReader[] oldReaders, int[] oldStarts, - Map oldNormsCache, boolean readOnly, boolean doClone) throws IOException { - super(directory, infos, closeDirectory, readOnly); + Map oldNormsCache, boolean readOnly, boolean doClone, PostingsCodecs codecs) throws IOException { + super(directory, infos, closeDirectory, readOnly, codecs); // we put the old SegmentReaders in a map, that allows us // to lookup a reader using its segment name @@ -121,7 +121,7 @@ // SegmentInfos, so it does not attempt to // obtain the write lock newReader = (SegmentReader) newReaders[i].clone(readOnly); - newReader.init(directory, null, false, readOnly); + newReader.init(directory, null, false, readOnly, codecs); } // Make sure reopenSegment did not carry over a @@ -218,9 +218,9 @@ // Return a new [ReadOnly]SegmentReader instead return SegmentReader.get(openReadOnly, infos, infos.info(0), false); } else if (openReadOnly) { - return new ReadOnlyMultiSegmentReader(directory, infos, closeDirectory, subReaders, starts, normsCache, doClone); + return new ReadOnlyMultiSegmentReader(directory, infos, closeDirectory, subReaders, starts, normsCache, doClone, codecs); } else { - return new MultiSegmentReader(directory, infos, closeDirectory, subReaders, starts, normsCache, false, doClone); + return new MultiSegmentReader(directory, infos, closeDirectory, subReaders, starts, normsCache, false, doClone, codecs); } } @@ -401,6 +401,10 @@ return new MultiTermDocs(subReaders, starts); } + public FieldsEnum fields() throws IOException { + return new MultiFieldsEnum(subReaders, starts); + } + public TermPositions termPositions() throws IOException { ensureOpen(); return new MultiTermPositions(subReaders, starts); @@ -664,4 +668,266 @@ return ((TermPositions) current).isPayloadAvailable(); } } + + static class FieldPQData { + final FieldsEnum fields; + TermsEnum terms; + final int start; + final IndexReader reader; + + FieldPQData(FieldsEnum fields, int start, IndexReader reader) { + this.fields = fields; + this.start = start; + this.reader = reader; + } + + void close() throws IOException { + //fields.close(); + } + } + + private static class MultiFieldsEnum extends FieldsEnum { + private String currentField; + private final IndexReader[] readers; + private final int[] starts; + + private final FieldPQData[] fields; + + private final FieldPQData[] sameFields; + int numSameFields; + + MultiFieldsEnum(IndexReader[] readers, int[] starts) throws IOException { + this.readers = readers; + this.starts = starts; + fields = new FieldPQData[readers.length]; + for(int i=0;i 0) { + FieldPQData top = (FieldPQData) queue.top(); + currentField = top.fields.field(); + return true; + } else { + return false; + } + } + */ + + public boolean next() { + // nocommit -- todo + throw new UnsupportedOperationException(); + } + + public boolean seek(String field) throws IOException { + boolean result; + currentField = field; + 
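// Descriptive sketch of what follows: seek each sub-reader's + // FieldsEnum to the requested field; sub-readers that actually + // have the field are collected into sameFields so that terms() + // can merge their TermsEnums. +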
numSameFields = 0; + for(int i=0;i 0; + } + + private final MultiTermsEnum terms = new MultiTermsEnum(); + + public TermsEnum terms() throws IOException { + terms.reset(sameFields, numSameFields); + return terms; + } + + public void close() { + } + } + + private static class MultiTermsEnum extends TermsEnum { + + FieldPQData[] fields; + int numFields; + + FieldPQData[] sameText; + int numSame; + + void reset(FieldPQData[] fields, int numFields) throws IOException { + this.fields = fields; + this.numFields = numFields; + for(int i=0;i 0; + } + + public String text() { + throw new UnsupportedOperationException(); + } + + public boolean next() { + // nocommit todo + throw new UnsupportedOperationException(); + } + + public void close() { + } + + public int docFreq() { + int sum = 0; + for(int i=0;i<numSame;i++) + sum += sameText[i].terms.docFreq(); + return sum; + } + } } Index: src/java/org/apache/lucene/index/TermInfosWriter.java =================================================================== --- src/java/org/apache/lucene/index/TermInfosWriter.java (revision 747337) +++ src/java/org/apache/lucene/index/TermInfosWriter.java (working copy) /** This stores a monotonically increasing set of <Term, TermInfo> pairs in a - Directory. A TermInfos can be written once, in order. + * + * @deprecated This class has been replaced by + * FormatPostingsTermsDictWriter. */ final class TermInfosWriter { /** The file format version, a negative number. */ public static final int FORMAT = -3; @@ -36,193 +38,4 @@ // NOTE: always change this if you switch to a new format! public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES; - - private FieldInfos fieldInfos; - private IndexOutput output; - private TermInfo lastTi = new TermInfo(); - private long size; - - // TODO: the default values for these two parameters should be settable from - // IndexWriter. However, once that's done, folks will start setting them to - // ridiculous values and complaining that things don't work well, as with - // mergeFactor. So, let's wait until a number of folks find that alternate - // values work better. Note that both of these values are stored in the - // segment, so that it's safe to change these w/o rebuilding all indexes. - - /** Expert: The fraction of terms in the "dictionary" which should be stored - * in RAM. Smaller values use more memory, but make searching slightly - * faster, while larger values use less memory and make searching slightly - * slower. Searching is typically not dominated by dictionary lookup, so - * tweaking this is rarely useful.*/ - int indexInterval = 128; - - /** Expert: The fraction of {@link TermDocs} entries stored in skip tables, - * used to accellerate {@link TermDocs#skipTo(int)}. Larger values result in - * smaller indexes, greater acceleration, but fewer accelerable cases, while - * smaller values result in bigger indexes, less acceleration and more - * accelerable cases. More detailed experiments would be useful here. */ - int skipInterval = 16; - - /** Expert: The maximum number of skip levels. Smaller values result in - slightly smaller indexes, but slower skipping in big posting lists. 
- */ - int maxSkipLevels = 10; - - private long lastIndexPointer; - private boolean isIndex; - private byte[] lastTermBytes = new byte[10]; - private int lastTermBytesLength = 0; - private int lastFieldNumber = -1; - - private TermInfosWriter other; - private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result(); - - TermInfosWriter(Directory directory, String segment, FieldInfos fis, - int interval) - throws IOException { - initialize(directory, segment, fis, interval, false); - other = new TermInfosWriter(directory, segment, fis, interval, true); - other.other = this; - } - - private TermInfosWriter(Directory directory, String segment, FieldInfos fis, - int interval, boolean isIndex) throws IOException { - initialize(directory, segment, fis, interval, isIndex); - } - - private void initialize(Directory directory, String segment, FieldInfos fis, - int interval, boolean isi) throws IOException { - indexInterval = interval; - fieldInfos = fis; - isIndex = isi; - output = directory.createOutput(segment + (isIndex ? ".tii" : ".tis")); - output.writeInt(FORMAT_CURRENT); // write format - output.writeLong(0); // leave space for size - output.writeInt(indexInterval); // write indexInterval - output.writeInt(skipInterval); // write skipInterval - output.writeInt(maxSkipLevels); // write maxSkipLevels - assert initUTF16Results(); - } - - void add(Term term, TermInfo ti) throws IOException { - UnicodeUtil.UTF16toUTF8(term.text, 0, term.text.length(), utf8Result); - add(fieldInfos.fieldNumber(term.field), utf8Result.result, utf8Result.length, ti); - } - - // Currently used only by assert statements - UnicodeUtil.UTF16Result utf16Result1; - UnicodeUtil.UTF16Result utf16Result2; - - // Currently used only by assert statements - private boolean initUTF16Results() { - utf16Result1 = new UnicodeUtil.UTF16Result(); - utf16Result2 = new UnicodeUtil.UTF16Result(); - return true; - } - - // Currently used only by assert statement - private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) { - - if (lastFieldNumber != fieldNumber) { - final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber)); - // If there is a field named "" (empty string) then we - // will get 0 on this comparison, yet, it's "OK". But - // it's not OK if two different field numbers map to - // the same name. - if (cmp != 0 || lastFieldNumber != -1) - return cmp; - } - - UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1); - UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2); - final int len; - if (utf16Result1.length < utf16Result2.length) - len = utf16Result1.length; - else - len = utf16Result2.length; - - for(int i=0;i<len;i++) { - final char ch1 = utf16Result1.result[i]; - final char ch2 = utf16Result2.result[i]; - if (ch1 != ch2) - return ch1-ch2; - } - return utf16Result1.length - utf16Result2.length; - } - - /** Adds a new <Term, TermInfo> pair to the set. - Term must be lexicographically greater than all previous Terms added. 
- TermInfo pointers must be positive and greater than all previous.*/ - void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti) - throws IOException { - - assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 || - (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) : - "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" + - " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" + - " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8"); - - assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")"; - assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")"; - - if (!isIndex && size % indexInterval == 0) - other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term - - writeTerm(fieldNumber, termBytes, termBytesLength); // write term - - output.writeVInt(ti.docFreq); // write doc freq - output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers - output.writeVLong(ti.proxPointer - lastTi.proxPointer); - - if (ti.docFreq >= skipInterval) { - output.writeVInt(ti.skipOffset); - } - - if (isIndex) { - output.writeVLong(other.output.getFilePointer() - lastIndexPointer); - lastIndexPointer = other.output.getFilePointer(); // write pointer - } - - lastFieldNumber = fieldNumber; - lastTi.set(ti); - size++; - } - - private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength) - throws IOException { - - // TODO: UTF16toUTF8 could tell us this prefix - // Compute prefix in common with last term: - int start = 0; - final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength; - while(start < limit) { - if (termBytes[start] != lastTermBytes[start]) - break; - start++; - } - - final int length = termBytesLength - start; - output.writeVInt(start); // write shared prefix length - output.writeVInt(length); // write delta length - output.writeBytes(termBytes, start, length); // write delta bytes - output.writeVInt(fieldNumber); // write field num - if (lastTermBytes.length < termBytesLength) { - byte[] newArray = new byte[(int) (termBytesLength*1.5)]; - System.arraycopy(lastTermBytes, 0, newArray, 0, start); - lastTermBytes = newArray; - } - System.arraycopy(termBytes, start, lastTermBytes, start, length); - lastTermBytesLength = termBytesLength; - } - - /** Called to complete TermInfos creation. */ - void close() throws IOException { - output.seek(4); // write size after format - output.writeLong(size); - output.close(); - - if (!isIndex) - other.close(); - } - } Index: src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java (revision 747337) +++ src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java (working copy) @@ -1,32 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -import org.apache.lucene.store.IndexInput; - -abstract class FormatPostingsPositionsConsumer { - - /** Add a new position & payload. If payloadLength > 0 - * you must read those bytes from the IndexInput. */ - abstract void addPosition(int position, byte[] payload, int payloadOffset, int payloadLength) throws IOException; - - /** Called when we are done adding positions & payloads */ - abstract void finish() throws IOException; -} Index: src/java/org/apache/lucene/index/PositionsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/PositionsConsumer.java (revision 0) +++ src/java/org/apache/lucene/index/PositionsConsumer.java (revision 0) @@ -0,0 +1,43 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.store.IndexOutput; + +abstract class PositionsConsumer { + + abstract void start(IndexOutput termsOut) throws IOException; + + abstract void startTerm() throws IOException; + + /** Add a new position & payload. If payloadLength > 0, + * the payload bytes are passed in via the payload array; + * you must fully consume them before returning, since the + * caller is free to reuse the array on subsequent calls. 
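+ * + * For example, an implementation would typically delta-code + * the position and copy the payload bytes out immediately. A + * hypothetical sketch (posOut and lastPosition are not part + * of this API): + * + * void addPosition(int position, byte[] payload, int payloadOffset, int payloadLength) throws IOException { + * posOut.writeVInt(position - lastPosition); + * lastPosition = position; + * if (payloadLength > 0) + * posOut.writeBytes(payload, payloadOffset, payloadLength); // copy now: the array is reused + * }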
*/ + abstract void addPosition(int position, byte[] payload, int payloadOffset, int payloadLength) throws IOException; + + /** Called when we are done adding positions & payloads + * for each doc */ + abstract void finishDoc() throws IOException; + + abstract void finishTerm(boolean isIndexTerm) throws IOException; + + abstract void close() throws IOException; +} Property changes on: src/java/org/apache/lucene/index/PositionsConsumer.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/index/IndexWriter.java =================================================================== --- src/java/org/apache/lucene/index/IndexWriter.java (revision 747337) +++ src/java/org/apache/lucene/index/IndexWriter.java (working copy) @@ -577,7 +577,7 @@ */ public IndexWriter(String path, Analyzer a, boolean create, MaxFieldLength mfl) throws CorruptIndexException, LockObtainFailedException, IOException { - init(FSDirectory.getDirectory(path), a, create, true, null, false, mfl.getLimit(), null, null); + init(FSDirectory.getDirectory(path), a, create, true, null, false, mfl.getLimit(), null, null, null); } /** @@ -606,7 +606,7 @@ */ public IndexWriter(String path, Analyzer a, boolean create) throws CorruptIndexException, LockObtainFailedException, IOException { - init(FSDirectory.getDirectory(path), a, create, true, null, true, DEFAULT_MAX_FIELD_LENGTH, null, null); + init(FSDirectory.getDirectory(path), a, create, true, null, true, DEFAULT_MAX_FIELD_LENGTH, null, null, null); } /** @@ -637,7 +637,7 @@ */ public IndexWriter(File path, Analyzer a, boolean create, MaxFieldLength mfl) throws CorruptIndexException, LockObtainFailedException, IOException { - init(FSDirectory.getDirectory(path), a, create, true, null, false, mfl.getLimit(), null, null); + init(FSDirectory.getDirectory(path), a, create, true, null, false, mfl.getLimit(), null, null, null); } /** @@ -666,7 +666,7 @@ */ public IndexWriter(File path, Analyzer a, boolean create) throws CorruptIndexException, LockObtainFailedException, IOException { - init(FSDirectory.getDirectory(path), a, create, true, null, true, DEFAULT_MAX_FIELD_LENGTH, null, null); + init(FSDirectory.getDirectory(path), a, create, true, null, true, DEFAULT_MAX_FIELD_LENGTH, null, null, null); } /** @@ -697,7 +697,7 @@ */ public IndexWriter(Directory d, Analyzer a, boolean create, MaxFieldLength mfl) throws CorruptIndexException, LockObtainFailedException, IOException { - init(d, a, create, false, null, false, mfl.getLimit(), null, null); + init(d, a, create, false, null, false, mfl.getLimit(), null, null, null); } /** @@ -725,7 +725,7 @@ */ public IndexWriter(Directory d, Analyzer a, boolean create) throws CorruptIndexException, LockObtainFailedException, IOException { - init(d, a, create, false, null, true, DEFAULT_MAX_FIELD_LENGTH, null, null); + init(d, a, create, false, null, true, DEFAULT_MAX_FIELD_LENGTH, null, null, null); } /** @@ -935,7 +935,7 @@ */ public IndexWriter(Directory d, boolean autoCommit, Analyzer a, boolean create) throws CorruptIndexException, LockObtainFailedException, IOException { - init(d, a, create, false, null, autoCommit, DEFAULT_MAX_FIELD_LENGTH, null, null); + init(d, a, create, false, null, autoCommit, DEFAULT_MAX_FIELD_LENGTH, null, null, null); } /** @@ -1022,7 +1022,7 @@ */ public IndexWriter(Directory d, Analyzer a, boolean create, IndexDeletionPolicy deletionPolicy, MaxFieldLength mfl) throws CorruptIndexException, LockObtainFailedException, IOException { - init(d, 
a, create, false, deletionPolicy, false, mfl.getLimit(), null, null); + init(d, a, create, false, deletionPolicy, false, mfl.getLimit(), null, null, null); } /** @@ -1057,9 +1057,9 @@ * false or if there is any other low-level * IO error */ - IndexWriter(Directory d, Analyzer a, boolean create, IndexDeletionPolicy deletionPolicy, MaxFieldLength mfl, IndexingChain indexingChain, IndexCommit commit) + IndexWriter(Directory d, Analyzer a, boolean create, IndexDeletionPolicy deletionPolicy, MaxFieldLength mfl, IndexingChain indexingChain, IndexCommit commit, PostingsCodecs codecs) throws CorruptIndexException, LockObtainFailedException, IOException { - init(d, a, create, false, deletionPolicy, false, mfl.getLimit(), indexingChain, commit); + init(d, a, create, false, deletionPolicy, false, mfl.getLimit(), indexingChain, commit, codecs); } /** @@ -1092,7 +1092,7 @@ */ public IndexWriter(Directory d, boolean autoCommit, Analyzer a, boolean create, IndexDeletionPolicy deletionPolicy) throws CorruptIndexException, LockObtainFailedException, IOException { - init(d, a, create, false, deletionPolicy, autoCommit, DEFAULT_MAX_FIELD_LENGTH, null, null); + init(d, a, create, false, deletionPolicy, autoCommit, DEFAULT_MAX_FIELD_LENGTH, null, null, null); } /** @@ -1133,24 +1133,30 @@ */ public IndexWriter(Directory d, Analyzer a, IndexDeletionPolicy deletionPolicy, MaxFieldLength mfl, IndexCommit commit) throws CorruptIndexException, LockObtainFailedException, IOException { - init(d, a, false, false, deletionPolicy, false, mfl.getLimit(), null, commit); + init(d, a, false, false, deletionPolicy, false, mfl.getLimit(), null, commit, null); } private void init(Directory d, Analyzer a, boolean closeDir, IndexDeletionPolicy deletionPolicy, boolean autoCommit, int maxFieldLength, IndexingChain indexingChain, IndexCommit commit) throws CorruptIndexException, LockObtainFailedException, IOException { if (IndexReader.indexExists(d)) { - init(d, a, false, closeDir, deletionPolicy, autoCommit, maxFieldLength, indexingChain, commit); + init(d, a, false, closeDir, deletionPolicy, autoCommit, maxFieldLength, indexingChain, commit, null); } else { - init(d, a, true, closeDir, deletionPolicy, autoCommit, maxFieldLength, indexingChain, commit); + init(d, a, true, closeDir, deletionPolicy, autoCommit, maxFieldLength, indexingChain, commit, null); } } + PostingsCodecs codecs; + private void init(Directory d, Analyzer a, final boolean create, boolean closeDir, IndexDeletionPolicy deletionPolicy, boolean autoCommit, int maxFieldLength, - IndexingChain indexingChain, IndexCommit commit) + IndexingChain indexingChain, IndexCommit commit, PostingsCodecs codecsIn) throws CorruptIndexException, LockObtainFailedException, IOException { this.closeDir = closeDir; + if (codecsIn == null) + codecs = PostingsCodecs.getDefault(); + else + codecs = codecsIn; directory = d; analyzer = a; setMessageID(defaultInfoStream); @@ -1176,14 +1182,14 @@ // searching. 
In this case we write the next // segments_N file with no segments: try { - segmentInfos.read(directory); + segmentInfos.read(directory, codecs); segmentInfos.clear(); } catch (IOException e) { // Likely this means it's a fresh directory } segmentInfos.commit(directory); } else { - segmentInfos.read(directory); + segmentInfos.read(directory, codecs); if (commit != null) { // Swap out all segments, but, keep metadata in @@ -1194,7 +1200,7 @@ if (commit.getDirectory() != directory) throw new IllegalArgumentException("IndexCommit's directory doesn't match my directory"); SegmentInfos oldInfos = new SegmentInfos(); - oldInfos.read(directory, commit.getSegmentsFileName()); + oldInfos.read(directory, commit.getSegmentsFileName(), codecs); segmentInfos.replace(oldInfos); changeCount++; if (infoStream != null) @@ -1217,7 +1223,7 @@ // KeepOnlyLastCommitDeleter: deleter = new IndexFileDeleter(directory, deletionPolicy == null ? new KeepOnlyLastCommitDeletionPolicy() : deletionPolicy, - segmentInfos, infoStream, docWriter); + segmentInfos, infoStream, docWriter, this.codecs); if (deleter.startingCommitDeleted) // Deletion policy deleted the "head" commit point. @@ -3086,7 +3092,7 @@ ensureOpen(); for (int i = 0; i < dirs.length; i++) { SegmentInfos sis = new SegmentInfos(); // read infos from dir - sis.read(dirs[i]); + sis.read(dirs[i], codecs); for (int j = 0; j < sis.size(); j++) { final SegmentInfo info = sis.info(j); docCount += info.docCount; @@ -3215,7 +3221,7 @@ } SegmentInfos sis = new SegmentInfos(); // read infos from dir - sis.read(dirs[i]); + sis.read(dirs[i], codecs); for (int j = 0; j < sis.size(); j++) { SegmentInfo info = sis.info(j); assert !segmentInfos.contains(info): "dup info dir=" + info.dir + " name=" + info.name; @@ -3404,9 +3410,11 @@ // lock: startTransaction(true); + success = false; + try { mergedName = newSegmentName(); - merger = new SegmentMerger(this, mergedName, null); + merger = new SegmentMerger(this, mergedName, null, codecs); IndexReader sReader = null; synchronized(this) { @@ -3432,7 +3440,7 @@ synchronized(this) { segmentInfos.clear(); // pop old infos & add new info = new SegmentInfo(mergedName, docCount, directory, false, true, - -1, null, false, merger.hasProx()); + -1, null, false, merger.hasProx(), merger.getCodec()); segmentInfos.add(info); } @@ -3478,7 +3486,7 @@ startTransaction(false); try { - merger.createCompoundFile(mergedName + ".cfs"); + merger.createCompoundFile(mergedName + ".cfs", info); synchronized(this) { info.setUseCompoundFile(true); } @@ -3828,7 +3836,8 @@ directory, false, true, docStoreOffset, docStoreSegment, docStoreIsCompoundFile, - docWriter.hasProx()); + docWriter.hasProx(), + docWriter.getCodec()); } docWriter.pushDeletes(); @@ -4367,7 +4376,8 @@ docStoreOffset, docStoreSegment, docStoreIsCompoundFile, - false); + false, + null); // Also enroll the merged segment into mergingSegments; // this prevents it from getting selected for a merge @@ -4456,7 +4466,7 @@ if (infoStream != null) message("merging " + merge.segString(directory)); - merger = new SegmentMerger(this, mergedName, merge); + merger = new SegmentMerger(this, mergedName, merge, codecs); boolean success = false; @@ -4467,6 +4477,8 @@ for (int i = 0; i < numSegments; i++) { SegmentInfo si = sourceSegmentsClone.info(i); + // nocommit + assert si.getCodec() != null; IndexReader reader = SegmentReader.get(true, si, MERGE_READ_BUFFER_SIZE, merge.mergeDocStores); // no need to set deleter (yet) merger.add(reader); totDocCount += reader.numDocs(); @@ -4480,6 +4492,9 @@ // This 
is where all the work happens: mergedDocCount = merge.info.docCount = merger.merge(merge.mergeDocStores); + // Record which codec was used to write the segment + merge.info.setCodec(merger.getCodec()); + assert mergedDocCount == totDocCount; success = true; @@ -4512,7 +4527,7 @@ final String compoundFileName = mergedName + "." + IndexFileNames.COMPOUND_FILE_EXTENSION; try { - merger.createCompoundFile(compoundFileName); + merger.createCompoundFile(compoundFileName, merge.info); success = true; } catch (IOException ioe) { synchronized(this) { Index: src/java/org/apache/lucene/index/PostingsCodec.java =================================================================== --- src/java/org/apache/lucene/index/PostingsCodec.java (revision 0) +++ src/java/org/apache/lucene/index/PostingsCodec.java (revision 0) @@ -0,0 +1,67 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Collection; +import java.io.IOException; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; + +abstract class PostingsCodec { + + static boolean DEBUG = false; + + static final int CODEC_HEADER = 0x1af65; + + /** Unique name that's used to retrieve this codec when + * reading the index */ + String name; + + /** Writes a new segment */ + abstract FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException; + + /** Reads a segment */ + abstract FieldsProducer fieldsProducer(Directory dir, FieldInfos fieldInfos, SegmentInfo si, int readBufferSize) throws IOException; + + /** Gathers files associated with this segment */ + abstract void files(SegmentInfo segmentInfo, Collection files); + + static void checkHeader(IndexInput in, String codec, int version) throws IOException { + + // Safety to guard against reading a bogus string: + int header = in.readVInt(); + if (header != CODEC_HEADER) + throw new CorruptIndexException("codec header mismatch"); + + final String actualCodec = in.readString(); + if (!codec.equals(actualCodec)) + throw new CorruptIndexException("codec mismatch: expected '" + codec + "' but got '" + actualCodec + "'"); + + int actualVersion = in.readVInt(); + if (actualVersion > version) + throw new CorruptIndexException("version '" + actualVersion + "' is too new (expected <= '" + version + "'"); + } + + static void writeHeader(IndexOutput out, String codec, int version) throws IOException { + out.writeVInt(CODEC_HEADER); + out.writeString(codec); + out.writeVInt(version); + } +} Property changes on: src/java/org/apache/lucene/index/PostingsCodec.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/index/IndexFileDeleter.java 
=================================================================== --- src/java/org/apache/lucene/index/IndexFileDeleter.java (revision 747337) +++ src/java/org/apache/lucene/index/IndexFileDeleter.java (working copy) @@ -129,7 +129,8 @@ * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ - public IndexFileDeleter(Directory directory, IndexDeletionPolicy policy, SegmentInfos segmentInfos, PrintStream infoStream, DocumentsWriter docWriter) + public IndexFileDeleter(Directory directory, IndexDeletionPolicy policy, SegmentInfos segmentInfos, PrintStream infoStream, DocumentsWriter docWriter, + PostingsCodecs codecs) throws CorruptIndexException, IOException { this.docWriter = docWriter; @@ -170,7 +171,7 @@ } SegmentInfos sis = new SegmentInfos(); try { - sis.read(directory, fileName); + sis.read(directory, fileName, codecs); } catch (FileNotFoundException e) { // LUCENE-948: on NFS (and maybe others), if // you have writers switching back and forth @@ -207,7 +208,7 @@ // try now to explicitly open this commit point: SegmentInfos sis = new SegmentInfos(); try { - sis.read(directory, segmentInfos.getCurrentSegmentFileName()); + sis.read(directory, segmentInfos.getCurrentSegmentFileName(), codecs); } catch (IOException e) { throw new CorruptIndexException("failed to locate current segments_N file"); } Index: src/java/org/apache/lucene/index/PositionsEnum.java =================================================================== --- src/java/org/apache/lucene/index/PositionsEnum.java (revision 0) +++ src/java/org/apache/lucene/index/PositionsEnum.java (revision 0) @@ -0,0 +1,38 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.util.AttributeSource; + +public abstract class PositionsEnum extends AttributeSource { + + /** Returns the next position. You should only call this + * up to {@link FormatPostingsDocsEnum#freq()} times else + * the behavior is not defined. 
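+ * + * A typical consumption loop (a sketch; "docs" stands for the + * docs enum this PositionsEnum came from, positioned on a + * document, and "payload" is a reusable byte[]): + * + * final int freq = docs.freq(); + * for(int i=0;i<freq;i++) { + * final int pos = positions.next(); + * if (positions.hasPayload()) + * payload = positions.getPayload(payload, 0); + * }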
*/ + abstract int next() throws IOException; + + abstract int getPayloadLength(); + + // nocommit -- improve this so that readers that do their + // own buffering can save a copy + abstract byte[] getPayload(byte[] data, int offset) throws IOException; + + abstract boolean hasPayload(); +} Index: src/java/org/apache/lucene/index/SegmentWriteState.java =================================================================== --- src/java/org/apache/lucene/index/SegmentWriteState.java (revision 747337) +++ src/java/org/apache/lucene/index/SegmentWriteState.java (working copy) @@ -26,21 +26,47 @@ DocumentsWriter docWriter; Directory directory; String segmentName; + FieldInfos fieldInfos; String docStoreSegmentName; int numDocs; - int termIndexInterval; int numDocsInStore; Collection flushedFiles; - public SegmentWriteState(DocumentsWriter docWriter, Directory directory, String segmentName, String docStoreSegmentName, int numDocs, - int numDocsInStore, int termIndexInterval) { + // Actual codec used + PostingsCodec codec; + + /** Expert: The fraction of terms in the "dictionary" which should be stored + * in RAM. Smaller values use more memory, but make searching slightly + * faster, while larger values use less memory and make searching slightly + * slower. Searching is typically not dominated by dictionary lookup, so + * tweaking this is rarely useful.*/ + int termIndexInterval; + + /** Expert: The fraction of {@link TermDocs} entries stored in skip tables, + * used to accellerate {@link TermDocs#skipTo(int)}. Larger values result in + * smaller indexes, greater acceleration, but fewer accelerable cases, while + * smaller values result in bigger indexes, less acceleration and more + * accelerable cases. More detailed experiments would be useful here. */ + int skipInterval = 16; + + /** Expert: The maximum number of skip levels. Smaller values result in + * slightly smaller indexes, but slower skipping in big posting lists. + */ + int maxSkipLevels = 10; + + public SegmentWriteState(DocumentsWriter docWriter, Directory directory, String segmentName, FieldInfos fieldInfos, + String docStoreSegmentName, int numDocs, + int numDocsInStore, int termIndexInterval, + PostingsCodecs codecs) { this.docWriter = docWriter; this.directory = directory; this.segmentName = segmentName; + this.fieldInfos = fieldInfos; this.docStoreSegmentName = docStoreSegmentName; this.numDocs = numDocs; this.numDocsInStore = numDocsInStore; this.termIndexInterval = termIndexInterval; + this.codec = codecs.getWriter(this); flushedFiles = new HashSet(); } Index: src/java/org/apache/lucene/index/TermDocs.java =================================================================== --- src/java/org/apache/lucene/index/TermDocs.java (revision 747337) +++ src/java/org/apache/lucene/index/TermDocs.java (working copy) @@ -26,7 +26,8 @@ ordered by document number. @see IndexReader#termDocs() - */ + @deprecated Use DocsEnum instead +*/ public interface TermDocs { /** Sets this to the data for a term. Index: src/java/org/apache/lucene/index/SepSkipListReader.java =================================================================== --- src/java/org/apache/lucene/index/SepSkipListReader.java (revision 0) +++ src/java/org/apache/lucene/index/SepSkipListReader.java (revision 0) @@ -0,0 +1,140 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.store.IndexInput; + +/** + * Implements the skip list reader for the default posting list format + * that stores positions and payloads. + * + */ +class SepSkipListReader extends MultiLevelSkipListReader { + private boolean currentFieldStoresPayloads; + private long freqPointer[]; + private long docPointer[]; + private long posPointer[]; + private long payloadPointer[]; + private int payloadLength[]; + + private long lastFreqPointer; + private long lastDocPointer; + private long lastPosPointer; + private long lastPayloadPointer; + private int lastPayloadLength; + + + SepSkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) { + super(skipStream, maxSkipLevels, skipInterval); + freqPointer = new long[maxSkipLevels]; + docPointer = new long[maxSkipLevels]; + posPointer = new long[maxSkipLevels]; + payloadPointer = new long[maxSkipLevels]; + payloadLength = new int[maxSkipLevels]; + } + + void init(long skipPointer, long docBasePointer, long freqBasePointer, long posBasePointer, long payloadBasePointer, int df, boolean storesPayloads) { + super.init(skipPointer, df); + this.currentFieldStoresPayloads = storesPayloads; + lastFreqPointer = freqBasePointer; + lastDocPointer = docBasePointer; + lastPosPointer = posBasePointer; + lastPayloadPointer = payloadBasePointer; + + Arrays.fill(docPointer, docBasePointer); + Arrays.fill(freqPointer, freqBasePointer); + Arrays.fill(posPointer, posBasePointer); + Arrays.fill(payloadPointer, payloadBasePointer); + Arrays.fill(payloadLength, 0); + } + + /** Returns the freq pointer of the doc to which the last call of + * {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */ + long getFreqPointer() { + return lastFreqPointer; + } + + long getDocPointer() { + return lastDocPointer; + } + + /** Returns the prox pointer of the doc to which the last call of + * {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */ + long getPosPointer() { + return lastPosPointer; + } + + long getPayloadPointer() { + return lastPayloadPointer; + } + + /** Returns the payload length of the payload stored just before + * the doc to which the last call of {@link MultiLevelSkipListReader#skipTo(int)} + * has skipped. 
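+ * + * In the skip stream, when the field stores payloads, the doc + * delta is shifted left one bit and the low bit flags whether a + * new payload length follows; readSkipData below decodes it as: + * + * delta = skipStream.readVInt(); + * if ((delta & 1) != 0) + * payloadLength[level] = skipStream.readVInt(); // length changed + * delta >>>= 1;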
*/ + int getPayloadLength() { + return lastPayloadLength; + } + + protected void seekChild(int level) throws IOException { + super.seekChild(level); + freqPointer[level] = lastFreqPointer; + docPointer[level] = lastDocPointer; + posPointer[level] = lastPosPointer; + payloadPointer[level] = lastPayloadPointer; + payloadLength[level] = lastPayloadLength; + } + + protected void setLastSkipData(int level) { + super.setLastSkipData(level); + lastFreqPointer = freqPointer[level]; + lastDocPointer = docPointer[level]; + lastPosPointer = posPointer[level]; + lastPayloadPointer = payloadPointer[level]; + lastPayloadLength = payloadLength[level]; + } + + + protected int readSkipData(int level, IndexInput skipStream) throws IOException { + int delta; + //System.out.println(" readSkipData skipFP=" + skipStream.getFilePointer() + " storesPayloads=" + currentFieldStoresPayloads); + if (currentFieldStoresPayloads) { + // the current field stores payloads. + // if the doc delta is odd then we have + // to read the current payload length + // because it differs from the length of the + // previous payload + delta = skipStream.readVInt(); + if ((delta & 1) != 0) { + payloadLength[level] = skipStream.readVInt(); + } + delta >>>= 1; + } else { + delta = skipStream.readVInt(); + } + //System.out.println(" delta=" + delta + " level=" + level); + freqPointer[level] += skipStream.readVInt(); + docPointer[level] += skipStream.readVInt(); + posPointer[level] += skipStream.readVInt(); + payloadPointer[level] += skipStream.readVInt(); + + return delta; + } +} Property changes on: src/java/org/apache/lucene/index/SepSkipListReader.java ___________________________________________________________________ Added: svn:eol-style + native Index: src/java/org/apache/lucene/index/ReadOnlyMultiSegmentReader.java =================================================================== --- src/java/org/apache/lucene/index/ReadOnlyMultiSegmentReader.java (revision 747337) +++ src/java/org/apache/lucene/index/ReadOnlyMultiSegmentReader.java (working copy) @@ -23,12 +23,15 @@ import java.util.Map; class ReadOnlyMultiSegmentReader extends MultiSegmentReader { - ReadOnlyMultiSegmentReader(Directory directory, SegmentInfos sis, boolean closeDirectory) throws IOException { - super(directory, sis, closeDirectory, true); + ReadOnlyMultiSegmentReader(Directory directory, SegmentInfos sis, boolean closeDirectory, PostingsCodecs codecs) throws IOException { + super(directory, sis, closeDirectory, true, codecs); } - ReadOnlyMultiSegmentReader(Directory directory, SegmentInfos infos, boolean closeDirectory, SegmentReader[] oldReaders, int[] oldStarts, Map oldNormsCache, boolean doClone) throws IOException { - super(directory, infos, closeDirectory, oldReaders, oldStarts, oldNormsCache, true, doClone); + ReadOnlyMultiSegmentReader(Directory directory, SegmentInfos infos, + boolean closeDirectory, SegmentReader[] oldReaders, + int[] oldStarts, Map oldNormsCache, boolean doClone, + PostingsCodecs codecs) throws IOException { + super(directory, infos, closeDirectory, oldReaders, oldStarts, oldNormsCache, true, doClone, codecs); } protected void acquireWriteLock() { Index: src/java/org/apache/lucene/util/ArrayUtil.java =================================================================== --- src/java/org/apache/lucene/util/ArrayUtil.java (revision 747337) +++ src/java/org/apache/lucene/util/ArrayUtil.java (working copy) @@ -110,6 +110,29 @@ return array; } + public static char[] grow(char[] array, int minSize) { + if (array.length < minSize) { + char[] 
newArray = new char[getNextSize(minSize)]; + System.arraycopy(array, 0, newArray, 0, array.length); + return newArray; + } else + return array; + } + + public static char[] grow(char[] array) { + return grow(array, 1+array.length); + } + + public static char[] shrink(char[] array, int targetSize) { + final int newSize = getShrinkSize(array.length, targetSize); + if (newSize != array.length) { + char[] newArray = new char[newSize]; + System.arraycopy(array, 0, newArray, 0, newSize); + return newArray; + } else + return array; + } + /** Returns hash of chars in range start (inclusive) to * end (inclusive) */ public static int hashCode(char[] array, int start, int end) { Index: src/java/org/apache/lucene/util/UnicodeUtil.java =================================================================== --- src/java/org/apache/lucene/util/UnicodeUtil.java (revision 747337) +++ src/java/org/apache/lucene/util/UnicodeUtil.java (working copy) @@ -77,11 +77,8 @@ public int length; public void setLength(int newLength) { - if (result.length < newLength) { - byte[] newArray = new byte[(int) (1.5*newLength)]; - System.arraycopy(result, 0, newArray, 0, length); - result = newArray; - } + if (result.length < newLength) + result = ArrayUtil.grow(result, newLength); length = newLength; } } @@ -92,11 +89,8 @@ public int length; public void setLength(int newLength) { - if (result.length < newLength) { - char[] newArray = new char[(int) (1.5*newLength)]; - System.arraycopy(result, 0, newArray, 0, length); - result = newArray; - } + if (result.length < newLength) + result = ArrayUtil.grow(result, newLength); length = newLength; } @@ -104,6 +98,13 @@ setLength(other.length); System.arraycopy(other.result, 0, result, 0, length); } + + public void copyText(String other) { + final int otherLength = other.length(); + setLength(otherLength); + other.getChars(0, otherLength, result, 0); + length = otherLength; + } } /** Encode characters from a char[] source, starting at Index: contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java =================================================================== --- contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java (revision 747337) +++ contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java (working copy) @@ -203,7 +203,7 @@ private Analyzer analyzer; private boolean fastMode = false; - private final boolean verbose = false; + private final boolean verbose = true; private static final String FIELD_NAME = "content"; @@ -333,7 +333,7 @@ if (useMemIndex && useRAMIndex) { if (verbose) System.out.println("diff="+ (score1-score2) + ", query=" + queries[q] + ", s1=" + score1 + ", s2=" + score2); if (score1 != score2 || score1 < 0.0f || score2 < 0.0f || score1 > 1.0f || score2 > 1.0f) { - throw new IllegalStateException("BUG DETECTED:" + (i*(q+1)) + " at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer); + throw new IllegalStateException("BUG DETECTED:" + (i*(q+1)) + " at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer + " score1=" + score1 + " score2=" + score2); } } } @@ -424,9 +424,11 @@ else searcher = ((MemoryIndex) index).createSearcher(); + System.out.println("now search"); final float[] scores = new float[1]; // inits to 0.0f searcher.search(query, new HitCollector() { public void collect(int doc, float score) { + System.out.println(" collect score=" + score); scores[0] = score; } });
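Note on the codec header convention introduced in PostingsCodec above: every file a codec writes should begin with writeHeader(out, name, version), and readers verify it with checkHeader(in, name, version), so a bogus or too-new file fails fast with CorruptIndexException. A minimal round-trip sketch (the codec name "sep" and the file name are hypothetical; dir is any Directory):

  IndexOutput out = dir.createOutput("_1.skp");
  PostingsCodec.writeHeader(out, "sep", 0);   // writes CODEC_HEADER vInt, then codec name, then version vInt
  // ... codec-private data goes here ...
  out.close();

  IndexInput in = dir.openInput("_1.skp");
  PostingsCodec.checkHeader(in, "sep", 0);    // throws CorruptIndexException on mismatch or newer version
  // ... read codec-private data ...
  in.close();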