Index: src/test/org/apache/lucene/store/MockRAMInputStream.java
===================================================================
--- src/test/org/apache/lucene/store/MockRAMInputStream.java	(revision 718730)
+++ src/test/org/apache/lucene/store/MockRAMInputStream.java	(working copy)
@@ -1,7 +1,5 @@
 package org.apache.lucene.store;
 
-import java.io.IOException;
-
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -19,6 +17,8 @@
  * limitations under the License.
  */
 
+import java.io.IOException;
+
 /**
  * Used by MockRAMDirectory to create an input stream that
  * keeps track of when it's been closed.
@@ -44,16 +44,8 @@
       // all clones get closed:
       if (!isClone) {
         synchronized(dir.openFiles) {
-          Integer v = (Integer) dir.openFiles.get(name);
-          // Could be null when MockRAMDirectory.crash() was called
-          if (v != null) {
-            if (v.intValue() == 1) {
-              dir.openFiles.remove(name);
-            } else {
-              v = new Integer(v.intValue()-1);
-              dir.openFiles.put(name, v);
-            }
-          }
+          assert dir.openFiles.containsKey(this): "input=" + name + " is not open";
+          dir.openFiles.remove(this);
         }
       }
     }
Index: src/test/org/apache/lucene/store/MockRAMDirectory.java
===================================================================
--- src/test/org/apache/lucene/store/MockRAMDirectory.java	(revision 718730)
+++ src/test/org/apache/lucene/store/MockRAMDirectory.java	(working copy)
@@ -208,9 +208,11 @@
     if (crashed)
       throw new IOException("cannot createOutput after crash");
     init();
-    synchronized(openFiles) {
+    synchronized(this) {
       if (preventDoubleWrite && createdFiles.contains(name) && !name.equals("segments.gen"))
         throw new IOException("file \"" + name + "\" was already written to");
+    }
+    synchronized(openFiles) {
       if (noDeleteOpenFile && openFiles.containsKey(name))
         throw new IOException("MockRAMDirectory: file \"" + name + "\" is still open: cannot overwrite");
     }
@@ -237,6 +239,15 @@
     return new MockRAMOutputStream(this, file);
   }
 
+  static class OpenFile {
+    final String name;
+    final Throwable stack;
+    OpenFile(String name) {
+      this.name = name;
+      this.stack = new Throwable();
+    }
+  }
+
   public IndexInput openInput(String name) throws IOException {
     RAMFile file;
     synchronized (this) {
@@ -245,17 +256,12 @@
     if (file == null)
       throw new FileNotFoundException(name);
     else {
+      IndexInput in = new MockRAMInputStream(this, name, file);
       synchronized(openFiles) {
-        if (openFiles.containsKey(name)) {
-          Integer v = (Integer) openFiles.get(name);
-          v = new Integer(v.intValue()+1);
-          openFiles.put(name, v);
-        } else {
-          openFiles.put(name, new Integer(1));
-        }
+        openFiles.put(in, new OpenFile(name));
       }
+      return in;
     }
-    return new MockRAMInputStream(this, name, file);
   }
 
   /** Provided for testing purposes.  Use sizeInBytes() instead.
   */
@@ -289,7 +295,14 @@
       if (noDeleteOpenFile && openFiles.size() > 0) {
         // RuntimeException instead of IOException because
         // super() does not throw IOException currently:
-        throw new RuntimeException("MockRAMDirectory: cannot close: there are still open files: " + openFiles);
+        Iterator it = openFiles.values().iterator();
+        System.out.println("\nMockRAMDirectory open files:");
+        while(it.hasNext()) {
+          OpenFile openFile = (OpenFile) it.next();
+          System.out.println("\nfile " + openFile.name + " opened from:\n");
+          openFile.stack.printStackTrace(System.out);
+        }
+        throw new RuntimeException("MockRAMDirectory: cannot close: there are still open files");
      }
    }
  }
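
The change above replaces MockRAMDirectory's per-name open-file counts with a map keyed by the IndexInput instance itself; each value is an OpenFile that captures a Throwable at open time, so a leaked input can be traced to the exact openInput call site when close() fails. A minimal standalone sketch of the same pattern, reduced to one class (LeakTracker and its method names are hypothetical, not part of this patch):

  import java.util.IdentityHashMap;
  import java.util.Iterator;
  import java.util.Map;

  class LeakTracker {
    // Key on the resource instance itself (identity, not name), so
    // two opens of the same file are tracked independently:
    private final Map open = new IdentityHashMap();

    public synchronized void opened(Object resource) {
      // new Throwable() records the caller's stack trace now; it is
      // only printed later, and only if the resource leaks:
      open.put(resource, new Throwable());
    }

    public synchronized void closed(Object resource) {
      assert open.containsKey(resource): "resource is not open";
      open.remove(resource);
    }

    public synchronized void assertAllClosed() {
      Iterator it = open.entrySet().iterator();
      while(it.hasNext()) {
        Map.Entry e = (Map.Entry) it.next();
        System.out.println("resource " + e.getKey() + " opened from:");
        ((Throwable) e.getValue()).printStackTrace(System.out);
      }
      if (open.size() > 0)
        throw new RuntimeException("there are still open resources");
    }
  }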
Index: src/test/org/apache/lucene/index/TestSegmentTermEnum.java
===================================================================
--- src/test/org/apache/lucene/index/TestSegmentTermEnum.java	(revision 718730)
+++ src/test/org/apache/lucene/index/TestSegmentTermEnum.java	(working copy)
@@ -65,23 +65,6 @@
     verifyDocFreq();
   }
 
-  public void testPrevTermAtEnd() throws IOException
-  {
-    Directory dir = new MockRAMDirectory();
-    IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
-    addDoc(writer, "aaa bbb");
-    writer.close();
-    IndexReader reader = IndexReader.open(dir);
-    SegmentTermEnum termEnum = (SegmentTermEnum) reader.terms();
-    assertTrue(termEnum.next());
-    assertEquals("aaa", termEnum.term().text());
-    assertTrue(termEnum.next());
-    assertEquals("aaa", termEnum.prev().text());
-    assertEquals("bbb", termEnum.term().text());
-    assertFalse(termEnum.next());
-    assertEquals("bbb", termEnum.prev().text());
-  }
-
   private void verifyDocFreq()
       throws IOException
   {
Index: src/test/org/apache/lucene/index/TestIndexReader.java
===================================================================
--- src/test/org/apache/lucene/index/TestIndexReader.java	(revision 718730)
+++ src/test/org/apache/lucene/index/TestIndexReader.java	(working copy)
@@ -870,15 +870,18 @@
       d.add(new Field("id", Integer.toString(i), Field.Store.YES, Field.Index.NOT_ANALYZED));
       d.add(new Field("content", "aaa " + i, Field.Store.NO, Field.Index.ANALYZED));
       writer.addDocument(d);
+      if (0==i%10)
+        writer.commit();
     }
     writer.close();
 
-    long diskUsage = startDir.sizeInBytes();
-    long diskFree = diskUsage+100;
+    long diskUsage = ((MockRAMDirectory) startDir).getRecomputedActualSizeInBytes();
+    long diskFree = diskUsage+100;
 
     IOException err = null;
 
     boolean done = false;
+    boolean gotExc = false;
 
     // Iterate w/ ever increasing free disk space:
     while(!done) {
@@ -935,7 +938,7 @@
           int docId = 12;
           for(int i=0;i<13;i++) {
             reader.deleteDocument(docId);
-            reader.setNorm(docId, "contents", (float) 2.0);
+            reader.setNorm(docId, "content", (float) 2.0);
             docId += 12;
           }
         }
@@ -950,6 +953,7 @@
             e.printStackTrace(System.out);
           }
           err = e;
+          gotExc = true;
           if (1 == x) {
             e.printStackTrace();
             fail(testName + " hit IOException after disk space was freed up");
@@ -1039,6 +1043,8 @@
       newReader.close();
 
       if (result2 == END_COUNT) {
+        if (!gotExc)
+          fail("never hit disk full");
         break;
       }
     }
Index: src/test/org/apache/lucene/index/TestStressIndexing2.java
===================================================================
--- src/test/org/apache/lucene/index/TestStressIndexing2.java	(revision 718730)
+++ src/test/org/apache/lucene/index/TestStressIndexing2.java	(working copy)
@@ -291,12 +291,12 @@
         if (!termEnum2.next()) break;
       }
 
+      assertEquals(len1, len2);
+      if (len1==0) break;  // no more terms
+
       if (!hasDeletes)
         assertEquals(termEnum1.docFreq(), termEnum2.docFreq());
 
-      assertEquals(len1, len2);
-      if (len1==0) break;  // no more terms
-
       assertEquals(term1, term2);
 
       // sort info2 to get it into ascending docid
Index: src/test/org/apache/lucene/index/TestSegmentTermDocs.java
===================================================================
--- src/test/org/apache/lucene/index/TestSegmentTermDocs.java	(revision 718730)
+++ src/test/org/apache/lucene/index/TestSegmentTermDocs.java	(working copy)
@@ -56,14 +56,14 @@
     SegmentReader reader = SegmentReader.get(info);
     reader.setTermInfosIndexDivisor(indexDivisor);
     assertTrue(reader != null);
-    SegmentTermDocs segTermDocs = new SegmentTermDocs(reader);
-    assertTrue(segTermDocs != null);
-    segTermDocs.seek(new Term(DocHelper.TEXT_FIELD_2_KEY, "field"));
-    if (segTermDocs.next() == true)
+    TermDocs termDocs = reader.termDocs();
+    assertTrue(termDocs != null);
+    termDocs.seek(new Term(DocHelper.TEXT_FIELD_2_KEY, "field"));
+    if (termDocs.next() == true)
     {
-      int docId = segTermDocs.doc();
+      int docId = termDocs.doc();
       assertTrue(docId == 0);
-      int freq = segTermDocs.freq();
+      int freq = termDocs.freq();
       assertTrue(freq == 3);
     }
     reader.close();
@@ -79,10 +79,10 @@
       SegmentReader reader = SegmentReader.get(info);
       reader.setTermInfosIndexDivisor(indexDivisor);
       assertTrue(reader != null);
-      SegmentTermDocs segTermDocs = new SegmentTermDocs(reader);
-      assertTrue(segTermDocs != null);
-      segTermDocs.seek(new Term("textField2", "bad"));
-      assertTrue(segTermDocs.next() == false);
+      TermDocs termDocs = reader.termDocs();
+      assertTrue(termDocs != null);
+      termDocs.seek(new Term("textField2", "bad"));
+      assertTrue(termDocs.next() == false);
       reader.close();
     }
     {
@@ -90,10 +90,10 @@
       SegmentReader reader = SegmentReader.get(info);
       reader.setTermInfosIndexDivisor(indexDivisor);
       assertTrue(reader != null);
-      SegmentTermDocs segTermDocs = new SegmentTermDocs(reader);
-      assertTrue(segTermDocs != null);
-      segTermDocs.seek(new Term("junk", "bad"));
-      assertTrue(segTermDocs.next() == false);
+      TermDocs termDocs = reader.termDocs();
+      assertTrue(termDocs != null);
+      termDocs.seek(new Term("junk", "bad"));
+      assertTrue(termDocs.next() == false);
       reader.close();
     }
   }
Index: src/test/org/apache/lucene/index/TestMultiLevelSkipList.java
===================================================================
--- src/test/org/apache/lucene/index/TestMultiLevelSkipList.java	(revision 718730)
+++ src/test/org/apache/lucene/index/TestMultiLevelSkipList.java	(working copy)
@@ -32,7 +32,8 @@
 import org.apache.lucene.document.Field.Index;
 import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.store.MockRAMDirectory;
+import org.apache.lucene.store.Directory;
 
 /**
  * This testcase tests whether multi-level skipping is being used
@@ -44,7 +45,7 @@
  */
 public class TestMultiLevelSkipList extends LuceneTestCase {
   public void testSimpleSkip() throws IOException {
-    RAMDirectory dir = new RAMDirectory();
+    Directory dir = new CountingRAMDirectory();
     IndexWriter writer = new IndexWriter(dir, new PayloadAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
     Term term = new Term("test", "a");
@@ -58,8 +59,7 @@
     writer.close();
 
     IndexReader reader = IndexReader.open(dir);
-    SegmentTermPositions tp = (SegmentTermPositions) reader.termPositions();
-    tp.freqStream = new CountingStream(tp.freqStream);
+    TermPositions tp = reader.termPositions();
 
     for (int i = 0; i < 2; i++) {
       counter = 0;
@@ -114,6 +114,15 @@
 
   }
 
+  class CountingRAMDirectory extends MockRAMDirectory {
+    public IndexInput openInput(String fileName) throws IOException {
+      IndexInput in = super.openInput(fileName);
+      if (fileName.endsWith(".frq"))
+        in = new CountingStream(in);
+      return in;
+    }
+  }
+
   private int counter = 0;
 
   // Simply extends IndexInput in a way that we are able to count the number
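
Rather than reaching into SegmentTermPositions.freqStream (package-private plumbing the flexible-postings code no longer exposes), the test now counts reads by intercepting openInput in a Directory subclass. The same wrapper idea generalizes to any file extension; a hedged sketch (ExtensionCountingDirectory is a hypothetical name, not part of this patch):

  import java.io.IOException;
  import org.apache.lucene.store.IndexInput;
  import org.apache.lucene.store.MockRAMDirectory;

  class ExtensionCountingDirectory extends MockRAMDirectory {
    private final String extension;
    int openCount;

    ExtensionCountingDirectory(String extension) {
      this.extension = extension;
    }

    public IndexInput openInput(String fileName) throws IOException {
      IndexInput in = super.openInput(fileName);
      if (fileName.endsWith(extension))
        openCount++;           // or wrap `in`, as CountingStream does, to count bytes read
      return in;
    }
  }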
Index: src/test/org/apache/lucene/index/TestFormatPostings.java
===================================================================
--- src/test/org/apache/lucene/index/TestFormatPostings.java	(revision 0)
+++ src/test/org/apache/lucene/index/TestFormatPostings.java	(revision 0)
@@ -0,0 +1,438 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.*;
+import org.apache.lucene.store.*;
+import java.util.*;
+
+// TODO
+//   - test w/ different indexDivisor
+//   - test field where payload length rarely changes
+//   - 0-term fields
+//   - seek/skip to same term/doc i'm already on
+//   - mix in deleted docs
+//   - seek, skip beyond end -- assert returns false
+//   - seek, skip to things that don't exist -- ensure it
+//     goes to 1 before next one known to exist
+//   - skipTo(term)
+//   - skipTo(doc)
+
+public class TestFormatPostings extends LuceneTestCase {
+
+  private boolean DEBUG = FormatPostingsPositionsReader.DEBUG;
+
+  private static final Random RANDOM = new Random(42);
+  private static String[] fieldNames = new String[] {"one", "two", "three", "four"};
+
+  private final static int NUM_TEST_ITER = 4000;
+  private final static int NUM_TEST_THREADS = 3;
+  private final static int NUM_FIELDS = 4;
+  private final static int NUM_TERMS_RAND = 100;
+  private final static int DOC_FREQ_RAND = 10;
+  private final static int TERM_DOC_FREQ_RAND = 20;
+
+  // start is inclusive and end is exclusive
+  public int nextInt(int start, int end) {
+    return start + RANDOM.nextInt(end-start);
+  }
+
+  private int nextInt(int lim) {
+    return RANDOM.nextInt(lim);
+  }
+
+  private boolean nextBoolean() {
+    return 0 == nextInt(2);
+  }
+
+  char[] getRandomText() {
+
+    final int len = 1+nextInt(10);
+    char[] buffer = new char[len+1];
+    for(int i=0;i=0;i--) {
+      if (DEBUG)
+        System.out.println("  TEST: term=" + field.terms[i].text2 + " has docFreq=" + field.terms[i].docs.length);
+      assertTrue(termsEnum.seek(field.terms[i].text2));
+      assertEquals(field.terms[i].docs.length, termsEnum.docFreq());
+    }
+
+    // Seek to non-existent empty-string term
+    assertFalse(termsEnum.seek(""));
+
+    // Make sure we're now pointing to first term
+    assertEquals(termsEnum.text(), field.terms[0].text2);
+
+    // Test docs enum
+    if (DEBUG)
+      System.out.println("\nTEST: docs/positions");
+    termsEnum.seek("");
+    upto = 0;
+    do {
+      if (nextInt(3) == 1) {
+        term = field.terms[upto];
+        if (DEBUG)
+          System.out.println("TEST [" + getDesc(field, term) + "]: iterate docs...");
+        DocsEnum docs = termsEnum.docs(null);
+        int upto2 = -1;
+        while(upto2 < term.docs.length-1) {
+          // Maybe skip:
+          final int left = term.docs.length-upto2;
+          if (nextInt(3) == 1 && left >= 2) {
+            int inc = 1+nextInt(left-1);
+            upto2 += inc;
+            if (DEBUG)
+              System.out.println("TEST [" + getDesc(field, term, term.docs[upto2]) + "]: dr.skip: " + left + " docs left; skip to doc=" + term.docs[upto2] + " [" + upto2 + " of " + term.docs.length + "]");
+            assertTrue(docs.skip(term.docs[upto2]));
+          } else {
+            assertTrue(docs.next());
+            upto2++;
+          }
+          if (DEBUG)
+            System.out.println("TEST [" + getDesc(field, term, term.docs[upto2]) + "]: got next doc...");
+          assertEquals(term.docs[upto2], docs.doc());
+          if (!field.omitTF) {
+            assertEquals(term.positions[upto2].length, docs.freq());
+            if (nextInt(2) == 1) {
+              if (DEBUG)
+                System.out.println("TEST [" + getDesc(field, term, term.docs[upto2]) + "]: check positions for doc " + term.docs[upto2] + "...");
+              verifyPositions(term.positions[upto2], docs.positions());
+            } else if (DEBUG)
+              System.out.println("TEST: skip positions...");
+          } else if (DEBUG)
+            System.out.println("TEST: skip positions: omitTF=true");
+        }
+
+        assertFalse(docs.next());
+
+      } else if (DEBUG)
+        System.out.println("TEST [" + getDesc(field, term) + "]: skip docs");
+      upto++;
+
+    } while (termsEnum.next());
+
+    assertEquals(upto, field.terms.length);
+
+    termsEnum.close();
+      }
+    }
+  }
+
+  private void write(FieldInfos fieldInfos, Directory dir, RandomField[] fields) throws Throwable {
+
+    // nocommit -- randomize this:
+    final int termIndexInterval = 16;
+
+    SegmentWriteState state = new SegmentWriteState(null, dir, SEGMENT, fieldInfos, null, 10000, 10000, termIndexInterval);
+
+    final FieldsConsumer consumer = new FormatPostingsTermsDictWriter(state,
+                                                                      new FormatPulsingDocsWriter(state,
+                                                                                                  1,
+                                                                                                  new FormatPostingsDocsWriter(state)));
+    Arrays.sort(fields);
+    for(int i=0;i= 0); }
Index: src/test/org/apache/lucene/TestSearchForDuplicates.java
===================================================================
--- src/test/org/apache/lucene/TestSearchForDuplicates.java	(revision 718730)
+++ src/test/org/apache/lucene/TestSearchForDuplicates.java	(working copy)
@@ -94,6 +94,9 @@
     for (int j = 0; j < MAX_DOCS; j++) {
       Document d = new Document();
       d.add(new Field(PRIORITY_FIELD, HIGH_PRIORITY, Field.Store.YES, Field.Index.ANALYZED));
+
+      // NOTE: this ID_FIELD produces no tokens since
+      // SimpleAnalyzer discards numbers
       d.add(new Field(ID_FIELD, Integer.toString(j), Field.Store.YES, Field.Index.ANALYZED));
       writer.addDocument(d);
     }
Index: src/java/org/apache/lucene/index/FormatPostingsDocsConsumer.java
===================================================================
--- src/java/org/apache/lucene/index/FormatPostingsDocsConsumer.java	(revision 718730)
+++ src/java/org/apache/lucene/index/FormatPostingsDocsConsumer.java	(working copy)
@@ -1,34 +0,0 @@
-package org.apache.lucene.index;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-
-/**
- * NOTE: this API is experimental and will likely change
- */
-
-abstract class FormatPostingsDocsConsumer {
-
-  /** Adds a new doc in this term.  If this returns null
-   *  then we just skip consuming positions/payloads. */
-  abstract FormatPostingsPositionsConsumer addDoc(int docID, int termDocFreq) throws IOException;
-
-  /** Called when we are done adding docs to this term */
-  abstract void finish() throws IOException;
-}
Index: src/java/org/apache/lucene/index/DocsConsumer.java
===================================================================
--- src/java/org/apache/lucene/index/DocsConsumer.java	(revision 0)
+++ src/java/org/apache/lucene/index/DocsConsumer.java	(revision 0)
@@ -0,0 +1,48 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.store.IndexOutput;
+
+/**
+ * NOTE: this API is experimental and will likely change
+ */
+
+abstract class DocsConsumer {
+
+  // nocommit
+  String desc;
+
+  abstract void start(IndexOutput termsOut) throws IOException;
+
+  abstract void startTerm() throws IOException;
+
+  /** Adds a new doc in this term.  Return null if this
+   *  consumer doesn't need to see the positions for this
+   *  doc. */
+  abstract PositionsConsumer addDoc(int docID, int termDocFreq) throws IOException;
+
+  /** Finishes the current term */
+  abstract void finishTerm(int numDocs, boolean isIndexTerm) throws IOException;
+
+  abstract void setField(FieldInfo fieldInfo);
+
+  abstract void close() throws IOException;
+}

Property changes on: src/java/org/apache/lucene/index/DocsConsumer.java
___________________________________________________________________
Name: svn:eol-style
   + native
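
DocsConsumer is one link in the new write chain; TestFormatPostings.write() above composes the full chain as FormatPostingsTermsDictWriter wrapping FormatPulsingDocsWriter wrapping FormatPostingsDocsWriter. Assuming those constructors keep the signatures used in the test, and assuming the terms-dict writer accepts any DocsConsumer (plausible but not shown in this excerpt), a non-pulsing chain would be wired like this sketch, with a SegmentWriteState named state in scope:

  // Sketch only: wiring the flex-postings write chain by hand, using
  // the constructor shapes shown in TestFormatPostings.write().
  // Dropping FormatPulsingDocsWriter from the middle writes every
  // term's postings through the plain docs writer:
  FieldsConsumer consumer =
      new FormatPostingsTermsDictWriter(state,
                                        new FormatPostingsDocsWriter(state));
  // Each layer forwards per-term calls downward: startTerm(),
  // addDoc(docID, termDocFreq), then finishTerm(numDocs, isIndexTerm).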
Index: src/java/org/apache/lucene/index/FormatPostingsDocsReader.java
===================================================================
--- src/java/org/apache/lucene/index/FormatPostingsDocsReader.java	(revision 0)
+++ src/java/org/apache/lucene/index/FormatPostingsDocsReader.java	(revision 0)
@@ -0,0 +1,427 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BitVector;
+
+/** Concrete class that reads the current doc/freq/skip
+ *  postings format */
+
+// nocommit -- should we switch "hasProx" higher up?  and
+// create two separate docs readers, one that also reads
+// prox and one that doesn't?
+
+class FormatPostingsDocsReader extends FormatPostingsTermsDictDocsReader {
+
+  final IndexInput freqIn;
+  IndexInput termsIn;
+  boolean DEBUG = FormatPostingsPositionsReader.DEBUG;
+
+  private final FormatPostingsPositionsReader posReader;
+
+  int skipInterval;
+  int maxSkipLevels;
+
+  FormatPostingsDocsReader(Directory dir, SegmentInfo segmentInfo, int readBufferSize) throws IOException {
+    freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, IndexFileNames.FREQ_EXTENSION), readBufferSize);
+
+    boolean success = false;
+    try {
+      if (segmentInfo.getHasProx())
+        posReader = new FormatPostingsPositionsReader(dir, segmentInfo, readBufferSize);
+      else
+        posReader = null;
+      success = true;
+    } finally {
+      if (!success)
+        freqIn.close();
+    }
+  }
+
+  void start(IndexInput termsIn) throws IOException {
+    this.termsIn = termsIn;
+
+    // Make sure we are talking to the matching postings writer
+    // nocommit -- refactor all of these to
+    // "checkCodecVersion" static method somewhere:
+
+    // nocommit -- guard against attempting to read /
+    // allocate absurdly massive string
+    String codec = termsIn.readString();
+    if (!codec.equals(FormatPostingsDocsWriter.CODEC))
+      throw new CorruptIndexException("codec mismatch: expected '" + FormatPostingsDocsWriter.CODEC + "' but got '" + codec + "'");
+
+    int version = termsIn.readInt();
+    if (version != FormatPostingsDocsWriter.VERSION_START)
+      throw new CorruptIndexException("version mismatch: expected " + FormatPostingsDocsWriter.VERSION_START + " but got " + version);
+
+    skipInterval = termsIn.readInt();
+    maxSkipLevels = termsIn.readInt();
+    if (posReader != null)
+      posReader.start(termsIn);
+  }
+
+  Reader reader(FieldInfo fieldInfo, IndexInput termsIn) {
+
+    final FormatPostingsPositionsReader.TermsDictReader posReader2;
+    if (posReader != null && !fieldInfo.omitTf)
+      posReader2 = (FormatPostingsPositionsReader.TermsDictReader) posReader.reader(fieldInfo, termsIn);
+    else
+      posReader2 = null;
+
+    return new TermsDictReader(fieldInfo, posReader2, termsIn);
+  }
+
+  void close() throws IOException {
+    try {
+      freqIn.close();
+    } finally {
+      if (posReader != null)
+        posReader.close();
+    }
+  }
+
+  class TermsDictReader extends Reader {
+
+    final IndexInput termsIn;
+    final FieldInfo fieldInfo;
+    long freqOffset;
+    long skipOffset;
+    int docFreq;
+    boolean DEBUG = FormatPostingsPositionsReader.DEBUG;
+
+    // TODO: abstraction violation (we are storing this with
+    // the concrete impl. as the type, not the abstract base
+    // class)
+    final FormatPostingsPositionsReader.TermsDictReader posReader;
+    private SegmentDocsEnum docs;
+
+    TermsDictReader(FieldInfo fieldInfo, FormatPostingsPositionsReader.TermsDictReader posReader, IndexInput termsIn) {
+      this.termsIn = termsIn;                     // not cloned
+      this.fieldInfo = fieldInfo;
+      this.posReader = posReader;
+    }
+
+    void readTerm(int docFreq, boolean isIndexTerm) throws IOException {
+
+      this.docFreq = docFreq;
+      if (DEBUG)
+        System.out.println("  dr.readTerm termsInPointer=" + termsIn.getFilePointer() + " df=" + docFreq + " isIndex=" + isIndexTerm);
+
+      if (isIndexTerm)
+        freqOffset = termsIn.readVLong();
+      else
+        freqOffset += termsIn.readVLong();
+
+      if (DEBUG)
+        System.out.println("    freqOffset=" + freqOffset + " vs len=" + freqIn.length());
+
+      if (docFreq >= skipInterval)
+        skipOffset = termsIn.readVLong();
+      else
+        skipOffset = 0;
+
+      if (posReader != null)
+        posReader.readTerm(docFreq, isIndexTerm);
+    }
+
+    public void close() throws IOException {
+      if (posReader != null)
+        posReader.close();
+    }
+
+    DocsEnum docs(BitVector deletedDocs) throws IOException {
+
+      if (docs == null)
+        // Lazy init
+        docs = new SegmentDocsEnum();
+
+      docs.init(deletedDocs);
+
+      return docs;
+    }
+
+    class SegmentDocsEnum extends DocsEnum {
+      int docFreq;
+      int doc;
+      int count;
+      int freq;
+      long skipStart;
+      long freqStart;
+      final IndexInput freqIn;
+      // nocommit -- should we do omitTF with 2 different enum classes?
+      final boolean omitTF;
+      private BitVector deletedDocs;
+
+      // nocommit -- should we do hasProx with 2 different enum classes?
+
+      boolean skipped;
+      DefaultSkipListReader skipper;
+
+      // TODO: abstraction violation: we are storing the
+      // concrete impl, not the abstract base class
+      FormatPostingsPositionsReader.TermsDictReader.SegmentPositionsEnum positions;
+      boolean DEBUG = FormatPostingsPositionsReader.DEBUG;
+
+      SegmentDocsEnum() {
+        if (DEBUG)
+          System.out.println("new docs enum");
+        this.freqIn = (IndexInput) FormatPostingsDocsReader.this.freqIn.clone();
+        omitTF = fieldInfo.omitTf;
+        if (omitTF)
+          freq = 1;
+      }
+
+      void close() {
+      }
+
+      void init(BitVector deletedDocs) throws IOException {
+        if (DEBUG)
+          System.out.println("[" + desc + "] dr.init freqIn seek " + freqOffset + " this=" + this + " (in=" + freqIn + "; this=" + this + ")");
+        this.deletedDocs = deletedDocs;
+        freqIn.seek(freqOffset);
+        this.docFreq = TermsDictReader.this.docFreq;
+        count = 0;
+        doc = 0;
+        skipped = false;
+        skipStart = freqStart + skipOffset;
+        proxSkipFreq = 0;
+
+        // maybe not necessary?
+        proxSkipPayloadLength = -1;
+
+        // TODO: abstraction violation
+        if (posReader != null)
+          proxOffset = posReader.proxOffset;
+      }
+
+      boolean next() throws IOException {
+        if (DEBUG)
+          System.out.println("dr [" + desc + "] next count=" + count + " vs df=" + docFreq + " freq pointer=" + freqIn.getFilePointer() + " (in=" + freqIn + "; this=" + this + ") has del docs=" + (deletedDocs != null));
+
+        // new Throwable().printStackTrace(System.out);
+
+        while(true) {
+          if (count == docFreq)
+            return false;
+
+          count++;
+
+          // Decode next doc/freq pair
+          final int code = freqIn.readVInt();
+          if (DEBUG)
+            System.out.println("  read code=" + code);
+          if (omitTF)
+            doc += code;
+          else {
+            doc += code >>> 1;              // shift off low bit
+            if ((code & 1) != 0)            // if low bit is set
+              freq = 1;                     // freq is one
+            else
+              freq = freqIn.readVInt();     // else read freq
+
+            if (positions != null)
+              positions.skip(freq);
+            else
+              proxSkipFreq += freq;
+          }
+
+          if (deletedDocs == null || !deletedDocs.get(doc))
+            break;
+          else if (DEBUG)
+            System.out.println("  doc=" + doc + " is deleted");
+        }
+
+        // nocommit
+        if (positions != null)
+          positions.desc = desc + ":" + doc;
+
+        return true;
+      }
+
+      int read(int[] docs, int[] freqs) throws IOException {
+        int i = 0;
+        final int length = docs.length;
+        while (i < length && count < docFreq) {
+          count++;
+          // manually inlined call to next() for speed
+          final int code = freqIn.readVInt();
+          if (omitTF) {
+            doc += code;
+            freq = 1;
+          } else {
+            doc += code >>> 1;              // shift off low bit
+            if ((code & 1) != 0)            // if low bit is set
+              freq = 1;                     // freq is one
+            else
+              freq = freqIn.readVInt();     // else read freq
+
+            if (positions != null)
+              positions.skip(freq);
+            else
+              proxSkipFreq += freq;
+          }
+
+          if (deletedDocs == null || !deletedDocs.get(doc)) {
+            docs[i] = doc;
+            freqs[i] = freq;
+            ++i;
+          }
+        }
+
+        return i;
+      }
+
+      int doc() {
+        return doc;
+      }
+
+      int ord() {
+        assert count > 0;
+        return count-1;
+      }
+
+      int freq() {
+        return freq;
+      }
+
+      long proxOffset;
+      int proxSkipPayloadLength = -1;
+      int proxSkipFreq;
+      PositionsEnum fakePositions;
+
+      PositionsEnum positions() throws IOException {
+        if (positions == null) {
+          // Lazy init
+          if (posReader == null) {
+            // TermFreq was omitted from this field during
+            // indexing, which means we pretend termFreq is
+            // always 1 with that 1 occurrence having
+            // position 0
+            if (fakePositions == null)
+              fakePositions = new FormatPostingsFakePositionsEnum();
+            return fakePositions;
+          } else {
+            // TODO: abstraction violation
+            positions = (FormatPostingsPositionsReader.TermsDictReader.SegmentPositionsEnum) posReader.positions();
+            if (DEBUG)
+              System.out.println("pos skip proxOffset=" + proxOffset + " payloadlen=" + proxSkipPayloadLength + " skipPosCount= " + proxSkipFreq);
+            positions.skip(proxOffset, proxSkipPayloadLength, proxSkipFreq);
+          }
+        }
+
+        positions.desc = desc + ":" + doc;
+
+        positions.catchUp(freq);
+
+        return positions;
+      }
+
+      boolean skip(int target) throws IOException {
+
+        // TODO: jump right to next() if target is < X away
+        // from where we are now?
+
+        if (DEBUG)
+          System.out.println("dr [" + desc + "]: skip to target=" + target);
+
+        if (skipOffset > 0) {
+
+          // There are enough docs in the posting to have
+          // skip data
+          if (skipper == null)
+            // Lazy init
+            skipper = new DefaultSkipListReader((IndexInput) freqIn.clone(), maxSkipLevels, skipInterval);
+
+          if (!skipped) {
+
+            // We haven't already skipped for this posting,
+            // so now we init the skipper
+
+            // TODO: this is abstraction violation; instead,
+            // skipper should interact with this as a
+            // private consumer
+            skipper.init(freqOffset+skipStart,
+                         freqOffset, proxOffset,
+                         docFreq, fieldInfo.storePayloads);
+
+            if (DEBUG)
+              System.out.println("  skip reader base freqFP=" + (freqOffset+skipStart) + " freqFP=" + freqOffset + " proxFP=" + proxOffset);
+
+            skipped = true;
+          }
+
+          final int newCount = skipper.skipTo(target);
+
+          if (newCount > count) {
+
+            if (DEBUG)
+              System.out.println("dr [" + desc + "]: skipper moved to newCount=" + newCount + " freqFP=" + skipper.getFreqPointer() + " proxFP=" + skipper.getProxPointer() + " doc=" + skipper.getDoc());
+
+            // Skipper did move
+            freqIn.seek(skipper.getFreqPointer());
+            count = newCount;
+            doc = skipper.getDoc();
+
+            // TODO: abstraction violation; this should be a
+            // private interaction b/w skipper & posReader
+            if (positions != null)
+              // nocommit -- should that be count?
+              positions.skip(skipper.getProxPointer(), skipper.getPayloadLength(), 0);
+            else {
+              proxOffset = skipper.getProxPointer();
+              proxSkipPayloadLength = skipper.getPayloadLength();
+              // nocommit -- should that be count?
+              proxSkipFreq = 0;
+            }
+          } else if (DEBUG)
+            System.out.println("  no skipping to be done");
+        }
+
+        // Now, linear scan for the rest:
+        do {
+          if (!next())
+            return false;
+        } while (target > doc);
+
+        return true;
+      }
+    }
+  }
+}
+
+/** Returned when someone asks for positions() enum on field
+ *  with omitTf true */
+class FormatPostingsFakePositionsEnum extends PositionsEnum {
+  int next() {
+    return 0;
+  }
+  int getPayloadLength() {
+    return 0;
+  }
+  boolean hasPayload() {
+    return false;
+  }
+  byte[] getPayload(byte[] data, int offset) {
+    return null;
+  }
}
\ No newline at end of file

Property changes on: src/java/org/apache/lucene/index/FormatPostingsDocsReader.java
___________________________________________________________________
Name: svn:eol-style
   + native
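
The inner decode loop in SegmentDocsEnum.next() above is the on-disk contract for the .frq file: each entry is a VInt code whose high bits hold the doc delta and whose low bit, when set, means freq==1 (otherwise a second VInt carries the freq). A self-contained sketch of just that encode/decode step (DocFreqCodec is a hypothetical name; plain ints stand in for the VInts written to disk):

  class DocFreqCodec {
    // encode: mirrors FormatPostingsDocsWriter.addDoc.  The doc delta
    // is shifted left one bit; a set low bit means "freq is one",
    // otherwise an explicit freq value follows.
    static int encode(int[] out, int upto, int delta, int freq) {
      if (freq == 1)
        out[upto++] = (delta << 1) | 1;
      else {
        out[upto++] = delta << 1;
        out[upto++] = freq;
      }
      return upto;
    }

    // decode one entry starting at in[upto], mirroring
    // SegmentDocsEnum.next(); returns {doc, freq, nextUpto}
    static int[] decode(int[] in, int upto, int doc) {
      final int code = in[upto++];
      doc += code >>> 1;                    // shift off low bit
      final int freq;
      if ((code & 1) != 0)                  // if low bit is set
        freq = 1;                           // freq is one
      else
        freq = in[upto++];                  // else read freq
      return new int[] {doc, freq, upto};
    }
  }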
Index: src/java/org/apache/lucene/index/FormatPostingsDocsWriter.java
===================================================================
--- src/java/org/apache/lucene/index/FormatPostingsDocsWriter.java	(revision 718730)
+++ src/java/org/apache/lucene/index/FormatPostingsDocsWriter.java	(working copy)
@@ -25,36 +25,69 @@
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.store.IndexOutput;
 
-final class FormatPostingsDocsWriter extends FormatPostingsDocsConsumer {
+final class FormatPostingsDocsWriter extends DocsConsumer {
 
+  final static String CODEC = "SingleFileDocFreqSkip";
+  final static int VERSION_START = -1;
+  final static int VERSION_CURRENT = VERSION_START;
+
   final IndexOutput out;
-  final FormatPostingsTermsWriter parent;
   final FormatPostingsPositionsWriter posWriter;
   final DefaultSkipListWriter skipListWriter;
   final int skipInterval;
+  final int maxSkipLevels;
   final int totalNumDocs;
 
+  IndexOutput termsOut;
+
+  boolean DEBUG = FormatPostingsPositionsReader.DEBUG;
+
   boolean omitTF;
   boolean storePayloads;
 
+  // Starts a new term
+  long lastFreqStart;
   long freqStart;
   FieldInfo fieldInfo;
 
-  FormatPostingsDocsWriter(SegmentWriteState state, FormatPostingsTermsWriter parent) throws IOException {
+  FormatPostingsDocsWriter(SegmentWriteState state) throws IOException {
     super();
-    this.parent = parent;
-    final String fileName = IndexFileNames.segmentFileName(parent.parent.segment, IndexFileNames.FREQ_EXTENSION);
+    final String fileName = IndexFileNames.segmentFileName(state.segmentName, IndexFileNames.FREQ_EXTENSION);
     state.flushedFiles.add(fileName);
-    out = parent.parent.dir.createOutput(fileName);
-    totalNumDocs = parent.parent.totalNumDocs;
+    out = state.directory.createOutput(fileName);
+    totalNumDocs = state.numDocs;
 
-    // TODO: abstraction violation
-    skipInterval = parent.parent.termsOut.skipInterval;
-    skipListWriter = parent.parent.skipListWriter;
-    skipListWriter.setFreqOutput(out);
+    // nocommit -- abstraction violation
+    skipListWriter = new DefaultSkipListWriter(state.skipInterval,
+                                               state.maxSkipLevels,
+                                               state.numDocs,
+                                               out,
+                                               null);
+    skipInterval = state.skipInterval;
+    maxSkipLevels = state.maxSkipLevels;
 
     posWriter = new FormatPostingsPositionsWriter(state, this);
   }
 
+  void start(IndexOutput termsOut) throws IOException {
+    this.termsOut = termsOut;
+    termsOut.writeString(CODEC);
+    termsOut.writeInt(VERSION_CURRENT);
+    termsOut.writeInt(skipInterval);                // write skipInterval
+    termsOut.writeInt(maxSkipLevels);               // write maxSkipLevels
+    posWriter.start(termsOut);
+  }
+
+  void startTerm() {
+    freqStart = out.getFilePointer();
+    if (!omitTF)
+      posWriter.startTerm();
+    skipListWriter.resetSkip();
+  }
+
+  // nocommit -- should we NOT reuse across fields?  would
+  // be cleaner
+
+  // Currently, this instance is re-used across fields, so
+  // our parent calls setField whenever the field changes
   void setField(FieldInfo fieldInfo) {
     this.fieldInfo = fieldInfo;
     omitTF = fieldInfo.omitTf;
@@ -65,11 +98,15 @@
 
   int lastDocID;
   int df;
 
+  int count;
+
   /** Adds a new doc in this term.  If this returns null
    *  then we just skip consuming positions/payloads. */
-  FormatPostingsPositionsConsumer addDoc(int docID, int termDocFreq) throws IOException {
+  PositionsConsumer addDoc(int docID, int termDocFreq) throws IOException {
 
     final int delta = docID - lastDocID;
+    if (DEBUG)
+      System.out.println("  dw.addDoc [" + desc + "] count=" + (count++) + " docID=" + docID + " lastDocID=" + lastDocID + " delta=" + delta + " omitTF=" + omitTF + " freq=" + termDocFreq + " freqPointer=" + out.getFilePointer());
 
     if (docID < 0 || (df > 0 && delta <= 0))
       throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )");
@@ -78,8 +115,12 @@
       // TODO: abstraction violation
       skipListWriter.setSkipData(lastDocID, storePayloads, posWriter.lastPayloadLength);
       skipListWriter.bufferSkip(df);
+      if (DEBUG)
+        System.out.println("    bufferSkip lastDocID=" + lastDocID + " df=" + df + " freqFP=" + out.getFilePointer() + " proxFP=" + skipListWriter.proxOutput.getFilePointer());
     }
 
+    // nocommit -- move this assert up above; every consumer
+    // shouldn't have to check for this bug:
     assert docID < totalNumDocs: "docID=" + docID + " totalNumDocs=" + totalNumDocs;
 
     lastDocID = docID;
@@ -92,36 +133,56 @@
       out.writeVInt(termDocFreq);
     }
 
-    return posWriter;
+    // nocommit
+    if (DEBUG)
+      ((FormatPostingsPositionsWriter) posWriter).desc = desc + ":" + docID;
+
+    if (omitTF)
+      return null;
+    else
+      return posWriter;
   }
 
-  private final TermInfo termInfo = new TermInfo(); // minimize consing
-  final UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
-
   /** Called when we are done adding docs to this term */
-  void finish() throws IOException {
-    long skipPointer = skipListWriter.writeSkip(out);
+  void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
 
-    // TODO: this is abstraction violation -- we should not
-    // peek up into parents terms encoding format
-    termInfo.set(df, parent.freqStart, parent.proxStart, (int) (skipPointer - parent.freqStart));
+    // nocommit -- wasteful we are counting this in two places?
+    assert docCount == df;
 
+    if (DEBUG)
+      System.out.println("dw.finishTerm termsOut pointer=" + termsOut.getFilePointer() + " freqStart=" + freqStart + " df=" + df);
 
-    // TODO: we could do this incrementally
-    UnicodeUtil.UTF16toUTF8(parent.currentTerm, parent.currentTermStart, utf8);
+    if (isIndexTerm)
+      // Write absolute at seek points
+      termsOut.writeVLong(freqStart);
+    else
+      // Write delta between seek points
+      termsOut.writeVLong(freqStart - lastFreqStart);
 
-    if (df > 0) {
-      parent.termsOut.add(fieldInfo.number,
-                          utf8.result,
-                          utf8.length,
-                          termInfo);
+    lastFreqStart = freqStart;
+
+    if (df >= skipInterval) {
+      if (DEBUG)
+        System.out.println("  writeSkip @ freqFP=" + out.getFilePointer() + " freqStartFP=" + freqStart);
+      termsOut.writeVLong(skipListWriter.writeSkip(out)-freqStart);
     }
 
+    if (!omitTF)
+      posWriter.finishTerm(isIndexTerm);
+
     lastDocID = 0;
     df = 0;
+
+    // nocommit
+    count = 0;
   }
 
   void close() throws IOException {
-    out.close();
-    posWriter.close();
+    if (DEBUG)
+      System.out.println("docs writer close pointer=" + out.getFilePointer());
+    try {
+      out.close();
+    } finally {
+      posWriter.close();
    }
  }
 }
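
finishTerm above writes the term's freq-file start pointer into the terms dict either as an absolute VLong (at index terms, i.e. seek points) or as a delta from the previous term; the positions writer later does the same for prox pointers. The reasoning: a reader that seeks directly to an index term must be able to decode the pointer with no history, while non-index terms are only ever reached by scanning forward from a seek point, so small deltas suffice. A toy sketch of both sides of the scheme (SeekPointPointers is a hypothetical name; the decode half mirrors TermsDictReader.readTerm above):

  import java.io.IOException;

  import org.apache.lucene.store.IndexInput;
  import org.apache.lucene.store.IndexOutput;

  class SeekPointPointers {
    long lastStart;                              // writer state
    void write(IndexOutput termsOut, long start, boolean isIndexTerm) throws IOException {
      if (isIndexTerm)
        termsOut.writeVLong(start);              // absolute: decodable right after a seek
      else
        termsOut.writeVLong(start - lastStart);  // delta: stays small while scanning
      lastStart = start;
    }

    long offset;                                 // reader state
    void read(IndexInput termsIn, boolean isIndexTerm) throws IOException {
      if (isIndexTerm)
        offset = termsIn.readVLong();
      else
        offset += termsIn.readVLong();
    }
  }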
Index: src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java
===================================================================
--- src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java	(revision 718730)
+++ src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java	(working copy)
@@ -1,73 +0,0 @@
-package org.apache.lucene.index;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-
-import org.apache.lucene.store.Directory;
-
-final class FormatPostingsFieldsWriter extends FormatPostingsFieldsConsumer {
-
-  final Directory dir;
-  final String segment;
-  final TermInfosWriter termsOut;
-  final FieldInfos fieldInfos;
-  final FormatPostingsTermsWriter termsWriter;
-  final DefaultSkipListWriter skipListWriter;
-  final int totalNumDocs;
-
-  public FormatPostingsFieldsWriter(SegmentWriteState state, FieldInfos fieldInfos) throws IOException {
-    super();
-
-    dir = state.directory;
-    segment = state.segmentName;
-    totalNumDocs = state.numDocs;
-    this.fieldInfos = fieldInfos;
-    termsOut = new TermInfosWriter(dir,
-                                   segment,
-                                   fieldInfos,
-                                   state.termIndexInterval);
-
-    // TODO: this is a nasty abstraction violation (that we
-    // peek down to find freqOut/proxOut) -- we need a
-    // better abstraction here whereby these child consumers
-    // can provide skip data or not
-    skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval,
-                                               termsOut.maxSkipLevels,
-                                               totalNumDocs,
-                                               null,
-                                               null);
-
-    state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_EXTENSION));
-    state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION));
-
-    termsWriter = new FormatPostingsTermsWriter(state, this);
-  }
-
-  /** Add a new field */
-  FormatPostingsTermsConsumer addField(FieldInfo field) {
-    termsWriter.setField(field);
-    return termsWriter;
-  }
-
-  /** Called when we are done adding everything. */
-  void finish() throws IOException {
-    termsOut.close();
-    termsWriter.close();
-  }
-}
Index: src/java/org/apache/lucene/index/TermsConsumer.java
===================================================================
--- src/java/org/apache/lucene/index/TermsConsumer.java	(revision 0)
+++ src/java/org/apache/lucene/index/TermsConsumer.java	(revision 0)
@@ -0,0 +1,37 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+/**
+ * NOTE: this API is experimental and will likely change
+ */
+
+abstract class TermsConsumer {
+
+  /** Starts a new term in this field; term ends with U+FFFF
+   *  char */
+  abstract DocsConsumer startTerm(char[] text, int start) throws IOException;
+
+  /** Finishes the current term */
+  abstract void finishTerm(char[] text, int start, int numDocs) throws IOException;
+
+  /** Called when we are done adding terms to this field */
+  abstract void finish() throws IOException;
+}

Property changes on: src/java/org/apache/lucene/index/TermsConsumer.java
___________________________________________________________________
Name: svn:eol-style
   + native

Index: src/java/org/apache/lucene/index/FormatPostingsTermsConsumer.java
===================================================================
--- src/java/org/apache/lucene/index/FormatPostingsTermsConsumer.java	(revision 718730)
+++ src/java/org/apache/lucene/index/FormatPostingsTermsConsumer.java	(working copy)
@@ -1,46 +0,0 @@
-package org.apache.lucene.index;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-
-import org.apache.lucene.util.ArrayUtil;
-
-/**
- * NOTE: this API is experimental and will likely change
- */
-
-abstract class FormatPostingsTermsConsumer {
-
-  /** Adds a new term in this field; term ends with U+FFFF
-   *  char */
-  abstract FormatPostingsDocsConsumer addTerm(char[] text, int start) throws IOException;
-
-  char[] termBuffer;
-  FormatPostingsDocsConsumer addTerm(String text) throws IOException {
-    final int len = text.length();
-    if (termBuffer == null || termBuffer.length < 1+len)
-      termBuffer = new char[ArrayUtil.getNextSize(1+len)];
-    text.getChars(0, len, termBuffer, 0);
-    termBuffer[len] = 0xffff;
-    return addTerm(termBuffer, 0);
-  }
-
-  /** Called when we are done adding terms to this field */
-  abstract void finish() throws IOException;
-}
Index: src/java/org/apache/lucene/index/FormatPostingsPositionsReader.java
===================================================================
--- src/java/org/apache/lucene/index/FormatPostingsPositionsReader.java	(revision 0)
+++ src/java/org/apache/lucene/index/FormatPostingsPositionsReader.java	(revision 0)
@@ -0,0 +1,236 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.util.BitVector;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.Directory;
+
+class FormatPostingsPositionsReader extends FormatPostingsTermsDictPositionsReader {
+
+  static boolean DEBUG = false;
+
+  final IndexInput proxIn;
+  IndexInput termsIn;
+
+  FormatPostingsPositionsReader(Directory dir, SegmentInfo segmentInfo, int readBufferSize) throws IOException {
+    assert segmentInfo.getHasProx();
+    proxIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, IndexFileNames.PROX_EXTENSION), readBufferSize);
+  }
+
+  void start(IndexInput termsIn) throws IOException {
+    this.termsIn = termsIn;
+    final String codec = termsIn.readString();
+
+    // nocommit -- refactor all of these to
+    // "checkCodecVersion" static method somewhere:
+    if (!codec.equals(FormatPostingsPositionsWriter.CODEC))
+      throw new CorruptIndexException("codec mismatch: expected '" + FormatPostingsPositionsWriter.CODEC + "' but got '" + codec + "'");
+
+    int version = termsIn.readInt();
+    if (version != FormatPostingsPositionsWriter.VERSION_START)
+      throw new CorruptIndexException("version mismatch: expected " + FormatPostingsPositionsWriter.VERSION_START + " but got " + version);
+  }
+
+  Reader reader(FieldInfo fieldInfo, IndexInput termsIn) {
+    return new TermsDictReader(termsIn, fieldInfo);
+  }
+
+  void close() throws IOException {
+    if (proxIn != null)
+      proxIn.close();
+  }
+
+  class TermsDictReader extends Reader {
+
+    final IndexInput termsIn;
+    final FieldInfo fieldInfo;
+    long proxOffset;
+
+    TermsDictReader(IndexInput termsIn, FieldInfo fieldInfo) {
+      this.termsIn = termsIn;
+      this.fieldInfo = fieldInfo;
+    }
+
+    void readTerm(int docFreq, boolean isIndexTerm) throws IOException {
+      if (DEBUG)
+        System.out.println("  pr.readterm termsInPointer=" + termsIn.getFilePointer() + " isIndex=" + isIndexTerm);
+      if (isIndexTerm)
+        proxOffset = termsIn.readVLong();
+      else
+        proxOffset += termsIn.readVLong();
+      if (DEBUG)
+        System.out.println("    proxOffset=" + proxOffset);
+      if (positions != null) {
+        positions.seekPending = true;
+        positions.skipOffset = proxOffset;
+        positions.skipPosCount = 0;
+      }
+    }
+
+    void close() throws IOException {
+    }
+
+    SegmentPositionsEnum positions;
+
+    PositionsEnum positions() throws IOException {
+
+      if (positions == null)
+        // Lazy init
+        positions = new SegmentPositionsEnum();
+
+      return positions;
+    }
+
+    // nocommit -- should we have different reader for
+    // payload vs no payload?
+    class SegmentPositionsEnum extends PositionsEnum {
+
+      // nocommit
+      String desc;
+
+      final IndexInput proxIn;
+
+      final boolean storePayloads;
+
+      boolean seekPending;                            // True if we must seek before reading next position
+      boolean payloadPending;                         // True if we must skip payload before reading next position
+
+      long skipOffset;
+      int skipPosCount;
+
+      int position;
+      int payloadLength;
+
+      SegmentPositionsEnum() {
+        if (DEBUG)
+          System.out.println("new pos enum");
+        proxIn = (IndexInput) FormatPostingsPositionsReader.this.proxIn.clone();
+        storePayloads = fieldInfo.storePayloads;
+      }
+
+      void skip(long proxOffset, int lastPayloadLength, int numPositions) {
+        skipOffset = proxOffset;
+        payloadLength = lastPayloadLength;
+        assert payloadLength >= 0 || payloadLength == -1;
+        skipPosCount = numPositions;
+        seekPending = true;
+        payloadPending = false;
+        if (DEBUG)
+          System.out.println("pr [" + desc + "] skip fp= " + proxOffset + " numPositions=" + numPositions);
+      }
+
+      void skip(int numPositions) {
+        skipPosCount += numPositions;
+        if (DEBUG)
+          System.out.println("pr [" + desc + "] skip " + numPositions + " positions; now " + skipPosCount);
+      }
+
+      void catchUp(int currentCount) throws IOException {
+        if (DEBUG)
+          System.out.println("  pos catchup: seekPending=" + seekPending + " skipOffset=" + skipOffset + " skipPosCount " + skipPosCount + " vs currentCount " + currentCount);
+        if (seekPending) {
+          proxIn.seek(skipOffset);
+          seekPending = false;
+        }
+
+        while(skipPosCount > currentCount)
+          next();
+        if (DEBUG)
+          System.out.println("  pos catchup done");
+        positions.init();
+      }
+
+      void init() {
+        if (DEBUG)
+          System.out.println("  pos init");
+        position = 0;
+      }
+
+      int next() throws IOException {
+
+        if (DEBUG)
+          System.out.println("    pr.next [" + desc + "]: fp=" + proxIn.getFilePointer() + " return pos=" + position);
+
+        if (storePayloads) {
+
+          if (payloadPending && payloadLength > 0) {
+            if (DEBUG)
+              System.out.println("      payload pending: skip " + payloadLength + " bytes");
+            proxIn.seek(proxIn.getFilePointer()+payloadLength);
+          }
+
+          final int code = proxIn.readVInt();
+          if ((code & 1) != 0) {
+            // Payload length has changed
+            payloadLength = proxIn.readVInt();
+            assert payloadLength >= 0;
+            if (DEBUG)
+              System.out.println("      new payloadLen=" + payloadLength);
+          }
+          assert payloadLength != -1;
+
+          payloadPending = true;
+          position += code >>> 1;
+        } else
+          position += proxIn.readVInt();
+
+        skipPosCount--;
+
+        // NOTE: the old API actually allowed this...
+        assert skipPosCount >= 0: "next() was called too many times (more than FormatPostingsDocsEnum.freq() times)";
+
+        if (DEBUG)
+          System.out.println("      proxFP=" + proxIn.getFilePointer() + " return pos=" + position);
+        return position;
+      }
+
+      int getPayloadLength() {
+        return payloadLength;
+      }
+
+      byte[] getPayload(byte[] data, int offset) throws IOException {
+
+        if (!payloadPending)
+          throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once.");
+
+        final byte[] retArray;
+        final int retOffset;
+        if (data == null || data.length-offset < payloadLength) {
+          // the array is too small to store the payload data,
+          // so we allocate a new one
+          retArray = new byte[payloadLength];
+          retOffset = 0;
+        } else {
+          retArray = data;
+          retOffset = offset;
+        }
+
+        proxIn.readBytes(retArray, retOffset, payloadLength);
+        payloadPending = false;
+        return retArray;
+      }
+
+      public boolean hasPayload() {
+        return payloadPending && payloadLength > 0;
+      }
+    }
+  }
}
\ No newline at end of file

Property changes on: src/java/org/apache/lucene/index/FormatPostingsPositionsReader.java
___________________________________________________________________
Name: svn:eol-style
   + native
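
The prox-stream layout read by SegmentPositionsEnum.next() above: when payloads are stored, each position delta is shifted left one bit, with the low bit flagging a payload-length change (the new length follows as a VInt); the payload bytes themselves are skipped lazily until getPayload() is called. A compact sketch of the matching writer side, mirroring FormatPostingsPositionsWriter.addPosition further down in this patch (PositionCodec is a hypothetical name; plain ints stand in for the VInts written to the .prx file):

  class PositionCodec {
    int lastPayloadLength = -1;                 // reset to -1 at each new term

    // encode one position, mirroring addPosition():
    int encode(int[] out, int upto, int delta, int payloadLength, boolean storePayloads) {
      if (storePayloads) {
        if (payloadLength != lastPayloadLength) {
          lastPayloadLength = payloadLength;
          out[upto++] = (delta << 1) | 1;       // low bit set: payload length changed
          out[upto++] = payloadLength;          // new length follows
        } else
          out[upto++] = delta << 1;             // low bit clear: same length as before
        // the payload bytes themselves are written next; the reader
        // defers reading them until getPayload() is actually called
      } else
        out[upto++] = delta;                    // no payloads: plain position delta
      return upto;
    }
  }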
Index: src/java/org/apache/lucene/index/SegmentInfo.java
===================================================================
--- src/java/org/apache/lucene/index/SegmentInfo.java	(revision 718730)
+++ src/java/org/apache/lucene/index/SegmentInfo.java	(working copy)
@@ -78,6 +78,7 @@
                                                   // (if it's an older index)
 
   private boolean hasProx;                        // True if this segment has any fields with omitTf==false
+  private boolean flexPostings;                   // True if postings were written with new flex format
 
   public SegmentInfo(String name, int docCount, Directory dir) {
     this.name = name;
@@ -92,6 +93,7 @@
     docStoreIsCompoundFile = false;
     delCount = 0;
     hasProx = true;
+    flexPostings = true;
   }
 
   public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile) {
@@ -108,6 +110,7 @@
     this.docStoreSegment = docStoreSegment;
     this.docStoreIsCompoundFile = docStoreIsCompoundFile;
     this.hasProx = hasProx;
+    flexPostings = true;
     delCount = 0;
     assert docStoreOffset == -1 || docStoreSegment != null: "dso=" + docStoreOffset + " dss=" + docStoreSegment + " docCount=" + docCount;
   }
@@ -188,6 +191,12 @@
         hasProx = input.readByte() == 1;
       else
         hasProx = true;
+
+      if (format <= SegmentInfos.FORMAT_FLEX_POSTINGS)
+        flexPostings = input.readByte() == 1;
+      else
+        flexPostings = false;
+
     } else {
       delGen = CHECK_DIR;
       normGen = null;
@@ -294,6 +303,8 @@
     si.docStoreOffset = docStoreOffset;
     si.docStoreSegment = docStoreSegment;
     si.docStoreIsCompoundFile = docStoreIsCompoundFile;
+    si.hasProx = hasProx;
+    si.flexPostings = flexPostings;
     return si;
   }
 
@@ -517,6 +528,7 @@
     output.writeByte(isCompoundFile);
     output.writeInt(delCount);
     output.writeByte((byte) (hasProx ? 1:0));
+    output.writeByte((byte) (flexPostings ? 1:0));
   }
 
   void setHasProx(boolean hasProx) {
@@ -528,6 +540,10 @@
     return hasProx;
   }
 
+  boolean getFlexPostings() {
+    return flexPostings;
+  }
+
   private void addIfExists(List files, String fileName) throws IOException {
     if (dir.fileExists(fileName))
       files.add(fileName);
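
The new flexPostings byte follows SegmentInfo's established back-compat pattern: write() always appends the byte (so the segments format number must be bumped, here via the SegmentInfos.FORMAT_FLEX_POSTINGS constant the diff references but whose definition lies outside this excerpt), while read gates on the format version and defaults the flag off for older segments. Isolated as a sketch (the surrounding class is hypothetical):

  import java.io.IOException;
  import org.apache.lucene.store.IndexInput;

  class FlagReadSketch {
    boolean flexPostings;
    void read(IndexInput input, int format) throws IOException {
      // more negative format values are newer and carry more fields:
      if (format <= SegmentInfos.FORMAT_FLEX_POSTINGS)
        flexPostings = input.readByte() == 1;
      else
        flexPostings = false;       // pre-flex segment: default off
    }
  }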
+ + if (isIndexTerm) + // Write absolute at seek points + termsOut.writeVLong(proxStart); + else + // Write delta between seek points + termsOut.writeVLong(proxStart-lastProxStart); + + lastProxStart = proxStart; + } + void close() throws IOException { if (out != null) out.close(); Index: src/java/org/apache/lucene/index/SegmentReader.java =================================================================== --- src/java/org/apache/lucene/index/SegmentReader.java (revision 718730) +++ src/java/org/apache/lucene/index/SegmentReader.java (working copy) @@ -36,6 +36,8 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.BitVector; +import org.apache.lucene.util.cache.Cache; +import org.apache.lucene.util.cache.SimpleLRUCache; import org.apache.lucene.util.CloseableThreadLocal; /** @@ -43,12 +45,18 @@ */ class SegmentReader extends DirectoryIndexReader { private String segment; - private SegmentInfo si; + // nocommit -- make private again + SegmentInfo si; private int readBufferSize; + boolean DEBUG = FormatPostingsPositionsReader.DEBUG; + FieldInfos fieldInfos; private FieldsReader fieldsReader; + FormatPostingsTermsDictReader terms; + FormatPostingsTermsDictDocsReader docsReader; + TermInfosReader tis; TermVectorsReader termVectorsReaderOrig = null; CloseableThreadLocal termVectorsLocal = new CloseableThreadLocal(); @@ -65,6 +73,8 @@ private int rollbackPendingDeleteCount; private boolean readOnly; + private boolean newPostings; + IndexInput freqStream; IndexInput proxStream; @@ -363,15 +373,24 @@ } } - tis = new TermInfosReader(cfsDir, segment, fieldInfos, readBufferSize); - loadDeletedDocs(); - // make sure that all index files have been read or are kept open - // so that if an index update removes them we'll still have them - freqStream = cfsDir.openInput(segment + ".frq", readBufferSize); - if (anyProx) - proxStream = cfsDir.openInput(segment + ".prx", readBufferSize); + if (si.getFlexPostings()) { + docsReader = new FormatPulsingDocsReader(cfsDir, si, readBufferSize, + new FormatPostingsDocsReader(cfsDir, si, readBufferSize)); + terms = new FormatPostingsTermsDictReader(cfsDir, fieldInfos, segment, + docsReader, + readBufferSize); + } else { + tis = new TermInfosReader(cfsDir, segment, fieldInfos, readBufferSize); + + // make sure that all index files have been read or are kept open + // so that if an index update removes them we'll still have them + freqStream = cfsDir.openInput(segment + ".frq", readBufferSize); + if (anyProx) + proxStream = cfsDir.openInput(segment + ".prx", readBufferSize); + } + openNorms(cfsDir, readBufferSize); if (doOpenStores && fieldInfos.hasVectors()) { // open term vector files only as needed @@ -474,9 +493,13 @@ clone.storeCFSReader = storeCFSReader; clone.fieldInfos = fieldInfos; - clone.tis = tis; - clone.freqStream = freqStream; - clone.proxStream = proxStream; + if (tis != null) { + clone.tis = tis; + clone.freqStream = freqStream; + clone.proxStream = proxStream; + } else + clone.terms = terms; + clone.termVectorsReaderOrig = termVectorsReaderOrig; @@ -619,6 +642,7 @@ boolean hasReferencedReader = (referencedSegmentReader != null); termVectorsLocal.close(); + perThread.close(); if (hasReferencedReader) { referencedSegmentReader.decRefReaderNotNorms(); @@ -645,13 +669,17 @@ // close everything, nothing is shared anymore with other readers if (tis != null) { tis.close(); + if (freqStream != null) + freqStream.close(); + if (proxStream != null) + proxStream.close(); + } else { + if (docsReader 
!= null) + docsReader.close(); + if (terms != null) + terms.close(); } - if (freqStream != null) - freqStream.close(); - if (proxStream != null) - proxStream.close(); - if (termVectorsReaderOrig != null) termVectorsReaderOrig.close(); @@ -708,12 +736,20 @@ public TermEnum terms() { ensureOpen(); - return tis.terms(); + if (tis == null) + return new TermsDictTermEnum(); + else + return tis.terms(); } public TermEnum terms(Term t) throws IOException { ensureOpen(); - return tis.terms(t); + if (tis == null) { + TermsDictTermEnum terms = new TermsDictTermEnum(); + terms.seek(t); + return terms; + } else + return tis.terms(t); } FieldInfos getFieldInfos() { @@ -738,21 +774,70 @@ public TermDocs termDocs() throws IOException { ensureOpen(); - return new SegmentTermDocs(this); + if (tis == null) + return new TermsDictTermDocs(); + else + return new SegmentTermDocs(this); } public TermPositions termPositions() throws IOException { ensureOpen(); - return new SegmentTermPositions(this); + if (tis == null) + return new TermsDictTermPositions(); + else + return new SegmentTermPositions(this); } + private final CloseableThreadLocal perThread = new CloseableThreadLocal(); + + // nocommit -- move term vectors under here + private static final class PerThread { + TermsDictTermEnum terms; + + // Used for caching the least recently looked-up Terms + Cache termsCache; + } + + private final static int DEFAULT_TERMS_CACHE_SIZE = 1024; + + private PerThread getPerThread() { + PerThread resources = (PerThread) perThread.get(); + if (resources == null) { + resources = new PerThread(); + resources.terms = new TermsDictTermEnum(); + // Cache does not have to be thread-safe, it is only used by one thread at the same time + resources.termsCache = new SimpleLRUCache(DEFAULT_TERMS_CACHE_SIZE); + perThread.set(resources); + } + return resources; + } + public int docFreq(Term t) throws IOException { ensureOpen(); - TermInfo ti = tis.get(t); - if (ti != null) - return ti.docFreq; - else - return 0; + if (tis == null) { + PerThread thread = getPerThread(); + Integer result = (Integer) thread.termsCache.get(t); + if (result == null) { + // Cache miss + final int freq; + thread.terms.seek(t); + if (thread.terms.term() != null && thread.terms.term().equals(t)) { + freq = thread.terms.docFreq(); + } else + freq = 0; + result = new Integer(freq); + thread.termsCache.put(t, result); + } + + return result.intValue(); + + } else { + TermInfo ti = tis.get(t); + if (ti != null) + return ti.docFreq; + else + return 0; + } } public int numDocs() { @@ -769,11 +854,17 @@ } public void setTermInfosIndexDivisor(int indexDivisor) throws IllegalStateException { - tis.setIndexDivisor(indexDivisor); + if (tis == null) + terms.setIndexDivisor(indexDivisor); + else + tis.setIndexDivisor(indexDivisor); } public int getTermInfosIndexDivisor() { - return tis.getIndexDivisor(); + if (tis == null) + return terms.getIndexDivisor(); + else + return tis.getIndexDivisor(); } /** @@ -1110,4 +1201,285 @@ norm.dirty = norm.rollbackDirty; } } -} + + // Back compat: implements TermEnum API using new flex API + class TermsDictTermEnum extends TermEnum { + String[] fields; + int fieldUpto; + TermsEnum currentField; + TermsDictTermEnum() { + final int numFields = terms.fields.size(); + fields = new String[numFields]; + + // Each field is guaranteed to have > 0 terms + Iterator it = terms.fields.keySet().iterator(); + int i = 0; + while(it.hasNext()) + fields[i++] = ((FieldInfo) it.next()).name; + Arrays.sort(fields); + fieldUpto = -1; + /* + 
System.out.println("sr.tdte: " + numFields + " fields"); + for(i=0;i= fields.length-1) + return false; + fieldUpto++; + //System.out.println("sr.tdte: now get new field fieldUpto=" + fieldUpto + " name=" + fields[fieldUpto]); + currentField = terms.getField(fieldInfos.fieldInfo(fields[fieldUpto])).terms(); + } + if (currentField.next()) + // This field still has terms + return true; + else { + // Done producing terms from this field + currentField.close(); + currentField = null; + } + } + } + + public Term term() { + if (currentField != null) { + final String text = currentField.text(); + if (text != null) + return new Term(fields[fieldUpto], text); + } + return null; + } + + public int docFreq() { + if (currentField == null) + return 0; + else + return currentField.docFreq(); + } + + public void close() throws IOException { + if (currentField != null) { + currentField.close(); + currentField = null; + } + fieldUpto = fields.length; + } + + // Seek forward only + public boolean skipTo(Term target) throws IOException { + // Just use seek, if the target is beyond our current + // point, else next(): + + if (fieldUpto >= fields.length) + // Already EOF + return false; + + if (fieldUpto >= 0) { + + final int cmp = target.field.compareTo(fields[fieldUpto]); + if (cmp < 0) + // Target is before our current term + return next(); + else if (cmp == 0) { + final int cmp2 = target.text.compareTo(currentField.text()); + if (cmp2 < 0) + // Target is before our current term + return next(); + } + } + + // OK target is in the future, so just seek + return seek(target); + } + + public boolean seek(Term target) throws IOException { + + if (currentField == null || !fields[fieldUpto].equals(target.field)) { + // Seek field + if (currentField != null) { + currentField.close(); + currentField = null; + } + + // nocommit -- binary search + fieldUpto = 0; + int cmp = 0; + + while(fieldUpto < fields.length) { + cmp = target.field.compareTo(fields[fieldUpto]); + if (cmp == 0) + break; + else if (cmp < 0) { + fieldUpto--; + return next(); + } + fieldUpto++; + } + + if (fieldUpto == fields.length) + return false; + + currentField = terms.getField(fieldInfos.fieldInfo(fields[fieldUpto])).terms(); + + assert currentField != null; + assert fields[fieldUpto].equals(target.field); + } + + // Field matches; now seek text + currentField.seek(target.text); + return currentField.text() != null; + } + } + + // Back compat + class TermsDictTermDocs implements TermDocs { + + String currentField; + TermsEnum currentFieldTerms; + DocsEnum docs; + + public void close() throws IOException { + if (docs != null) { + docs.close(); + docs = null; + } + if (currentFieldTerms != null) { + currentFieldTerms.close(); + currentFieldTerms = null; + } + } + + public void seek(TermEnum termEnum) throws IOException { + // nocommit -- optimize for the special cases here + seek(termEnum.term()); + } + + public boolean skipTo(int target) throws IOException { + if (docs == null) return false; + return docs.skip(target); + } + + public int read(int[] docs, int[] freqs) throws IOException { + if (this.docs == null) return 0; + return this.docs.read(docs, freqs); + } + + public void seek(Term term) throws IOException { + + if (DEBUG) + System.out.println("\nwrapper termdocs.seek term=" + term); + + if (currentField != null && !term.field.equals(currentField)) { + if (DEBUG) + System.out.println(" clear current field " + currentField); + if (currentFieldTerms != null) { + currentFieldTerms.close(); + currentFieldTerms = null; + } + currentField = null; 
+ } + + if (currentFieldTerms == null) { + currentField = term.field; + TermsProducer field = terms.getField(fieldInfos.fieldInfo(term.field)); + if (DEBUG) + System.out.println(" lookup field=" + field); + if (field != null) { + currentFieldTerms = field.terms(); + if (DEBUG) + System.out.println(" got terms=" + currentFieldTerms); + } + } + + if (currentFieldTerms != null) { + if (currentFieldTerms.seek(term.text)) { + if (DEBUG) + System.out.println(" seek true: " + currentFieldTerms.text()); + if (currentFieldTerms.text().equals(term.text)) + docs = currentFieldTerms.docs(deletedDocs); + else + docs = null; + } else + docs = null; + } else + docs = null; + } + + public int doc() { + if (docs == null) return 0; + return docs.doc(); + } + + public int freq() { + if (docs == null) return 0; + return docs.freq(); + } + + public boolean next() throws IOException { + if (docs == null) return false; + return docs.next(); + } + } + + // Back compat + final class TermsDictTermPositions extends TermsDictTermDocs implements TermPositions { + + PositionsEnum positions; + + public void seek(TermEnum termEnum) throws IOException { + super.seek(termEnum); + if (docs != null) + positions = docs.positions(); + } + + public boolean skipTo(int target) throws IOException { + boolean result = super.skipTo(target); + if (result && docs != null) + positions = docs.positions(); + else + positions = null; + return result; + } + + public int read(int[] docs, int[] freqs) throws IOException { + throw new UnsupportedOperationException("TermPositions does not support processing multiple documents in one call. Use TermDocs instead."); + } + + public void seek(Term term) throws IOException { + super.seek(term); + if (docs != null) + positions = docs.positions(); + else + positions = null; + } + + public boolean next() throws IOException { + boolean result = super.next(); + if (result && docs != null) + positions = docs.positions(); + else + positions = null; + return result; + } + + public int nextPosition() throws IOException { + return positions.next(); + } + + public int getPayloadLength() { + return positions.getPayloadLength(); + } + + public byte[] getPayload(byte[] data, int offset) throws IOException { + return positions.getPayload(data, offset); + } + + public boolean isPayloadAvailable() { + return positions.hasPayload(); + } + } +} \ No newline at end of file Index: src/java/org/apache/lucene/index/SegmentTermEnum.java =================================================================== --- src/java/org/apache/lucene/index/SegmentTermEnum.java (revision 718730) +++ src/java/org/apache/lucene/index/SegmentTermEnum.java (working copy) @@ -20,6 +20,10 @@ import java.io.IOException; import org.apache.lucene.store.IndexInput; +/** + * @deprecated No longer used with flex indexing, except for + * reading old segments */ + final class SegmentTermEnum extends TermEnum implements Cloneable { private IndexInput input; FieldInfos fieldInfos; Index: src/java/org/apache/lucene/index/FormatPostingsTermsDictReader.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsTermsDictReader.java (revision 0) +++ src/java/org/apache/lucene/index/FormatPostingsTermsDictReader.java (revision 0) @@ -0,0 +1,449 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Map; +import java.util.HashMap; + +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitVector; + +class FormatPostingsTermsDictReader extends FieldsProducer { + private final IndexInput in; + private final IndexInput indexIn; + private final int indexInterval; + private final FormatPostingsTermsDictDocsReader docs; + + boolean DEBUG = FormatPostingsPositionsReader.DEBUG; + + private int indexDivisor = 1; + private boolean anyIndexRead; + + int totalIndexInterval; + + private final FieldInfos fieldInfos; + final Map fields = new HashMap(); + private final String segment; + + FormatPostingsTermsDictReader(Directory dir, FieldInfos fieldInfos, String segment, FormatPostingsTermsDictDocsReader docs, int readBufferSize) throws IOException { + in = dir.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.TERMS_EXTENSION), readBufferSize); + boolean success = false; + try { + indexIn = dir.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.TERMS_INDEX_EXTENSION), readBufferSize); + success = true; + } finally { + if (!success) + in.close(); + } + + success = false; + try { + + this.fieldInfos = fieldInfos; + this.segment = segment; + int format = in.readInt(); + if (format != FormatPostingsTermsDictWriter.FORMAT) + throw new CorruptIndexException("format mismatch"); + + final long dirOffset = in.readLong(); + indexInterval = in.readInt(); + totalIndexInterval = indexInterval; + + this.docs = docs; + docs.start(in); + in.seek(dirOffset); + + final int numFields = in.readInt(); + // nocommit -- why did i want to order by field number? + //int lastFieldNumber = -1; + for(int i=0;i<numFields;i++) { + final int field = in.readInt(); + //assert field > lastFieldNumber; + //lastFieldNumber = field; + final long numTerms = in.readLong(); + final long termsStartPointer = in.readLong(); + final long indexStartPointer = in.readLong(); + if (numTerms > 0) { + final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); + fields.put(fieldInfo, new FieldReader(fieldInfo, numTerms, termsStartPointer, indexStartPointer)); + } + } + success = true; + } finally { + if (!success) { + try { + in.close(); + } finally { + indexIn.close(); + } + } + } + } + + public TermsProducer getField(FieldInfo fieldInfo) { + return (TermsProducer) fields.get(fieldInfo); + } + + // nocommit -- not consistent with FPTDW's close, which + // closes the consumer + public void close() throws IOException { + try { + in.close(); + } finally { + indexIn.close(); + } + } + + /** +
Sets the indexDivisor, which subsamples the number + * of indexed terms loaded into memory. This has a + * similar effect as {@link + * IndexWriter#setTermIndexInterval} except that setting + * must be done at indexing time while this setting can be + * set per reader. When set to N, then one in every + * N*termIndexInterval terms in the index is loaded into + * memory. By setting this to a value > 1 you can reduce + * memory usage, at the expense of higher latency when + * loading a TermInfo. The default value is 1.
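+ * For example, with termIndexInterval=128 at write time and an
+ * indexDivisor of 2 on this reader, one of every 256 terms is held in
+ * memory: the terms index shrinks by half, at the cost of a longer
+ * sequential scan per dictionary lookup.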
+ * + * NOTE: you must call this before the term + * index is loaded. If the index is already loaded, + * an IllegalStateException is thrown. + * + + @throws IllegalStateException if the term index has + * already been loaded into memory. + */ + public void setIndexDivisor(int indexDivisor) throws IllegalStateException { + if (indexDivisor < 1) + throw new IllegalArgumentException("indexDivisor must be > 0: got " + indexDivisor); + + if (anyIndexRead) + throw new IllegalStateException("index terms are already loaded"); + + this.indexDivisor = indexDivisor; + totalIndexInterval = indexInterval * indexDivisor; + } + + /** Returns the indexDivisor. + * @see #setIndexDivisor + */ + public int getIndexDivisor() { + return indexDivisor; + } + + // nocommit -- static? + private class FieldReader extends TermsProducer { + + final long numTerms; + final FieldInfo fieldInfo; + final long indexStartPointer; + final long termsStartPointer; + + // TODO: genericize "skipper" API so that we could swap + // in a multi-level skipper, here, instead of flat one: + // TODO: we could save mem here by packing our own shared char[]'s + String[] indexTerms; + long[] indexOffsets; + + FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long indexStartPointer) { + this.fieldInfo = fieldInfo; + this.numTerms = numTerms; + assert numTerms > 0; + this.indexStartPointer = indexStartPointer; + this.termsStartPointer = termsStartPointer; + } + + synchronized final void readIndex() throws IOException { + if (indexTerms != null) + return; + + final int indexSize = (int) (1+(numTerms-1)/totalIndexInterval); + + if (DEBUG) + System.out.println(" tdr.readIndex field=" + fieldInfo.name + " numTerms=" + numTerms + " indexSize=" + indexSize + " indexSeek=" + indexStartPointer + " segment=" + segment + " indexDivisor=" + indexDivisor); + + IndexInput in = (IndexInput) indexIn.clone(); + in.seek(indexStartPointer); + + indexTerms = new String[indexSize]; + indexOffsets = new long[indexSize]; + + if (DEBUG) + System.out.println("read index for field=" + fieldInfo.name); + + long pointer = termsStartPointer; + final DeltaBytesReader bytesReader = new DeltaBytesReader(in); + final int numIndexTerms = (int) (1+(numTerms-1)/indexInterval); + int upto = 0; + for(int i=0;i= numTerms) { + termUpto++; + return false; + } + if (DEBUG) { + System.out.println("tdr.next: field=" + fieldInfo.name + " termsInPointer=" + in.getFilePointer() + " vs len=" + in.length() + " isIndex=" + (termUpto%indexInterval==0) + " this=" + this); + //new Throwable().printStackTrace(System.out); + } + bytesReader.read(); + docFreq = in.readVInt(); + if (DEBUG) + System.out.println(" text=" + bytesReader.text() + " freq=" + docFreq); + docs.readTerm(docFreq, termUpto % indexInterval == 0); + termUpto++; + if (DEBUG) + System.out.println(" termUpto=" + termUpto + " vs numTerms=" + numTerms + " fp=" + in.getFilePointer()); + return true; + } + + public int docFreq() { + if (termUpto >= 1+numTerms) + return 0; + else + return docFreq; + } + + public String text() { + // nocommit -- really necessary? 
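+ // next() leaves termUpto one past numTerms once the enum is exhausted,
+ // so text() and docFreq() both treat termUpto >= 1+numTerms as
+ // end-of-enum and return null / 0 instead of stale values: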
+ if (termUpto >= 1+numTerms) + return null; + else + return bytesReader.text(); + } + + public long ord() { + return termUpto-1; + } + + public DocsEnum docs(BitVector deletedDocs) throws IOException { + doSkip = false; + // nocommit + DocsEnum docsEnum = docs.docs(deletedDocs); + docsEnum.desc = fieldInfo.name + ":" + bytesReader.text(); + return docsEnum; + } + + public void close() throws IOException { + in.close(); + docs.close(); + } + } + } + + private static class DeltaBytesReader { + private byte[] bytes; + final UnicodeUtil.UTF16Result chars = new UnicodeUtil.UTF16Result(); + final UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result(); + private int length; + final IndexInput in; + boolean started; + + DeltaBytesReader(IndexInput in) { + this.in = in; + bytes = new byte[10]; + } + + void reset(String text) { + UnicodeUtil.UTF16toUTF8(text, 0, text.length(), utf8); + if (utf8.length > bytes.length) + bytes = ArrayUtil.grow(bytes, utf8.length); + System.arraycopy(utf8.result, 0, + this.bytes, 0, utf8.length); + this.length = utf8.length; + chars.copyText(text); + } + + String text() { + // nocommit -- cache this? + return new String(chars.result, 0, chars.length); + } + + int compareTo(String other) { + + final int otherLength = other.length(); + final int minLength; + if (otherLength < chars.length) + minLength = otherLength; + else + minLength = chars.length; + + for(int i=0;i<minLength;i++) { + final char c = chars.result[i]; + final char otherC = other.charAt(i); + if (c < otherC) + return -1; + else if (c > otherC) + return 1; + } + + if (chars.length < otherLength) + return -1; + else if (chars.length > otherLength) + return 1; + else + return 0; + } + + void read() throws IOException { + //System.out.println("terms reader fp=" + in.getFilePointer() + " this=" + this); + final int start = in.readVInt(); + final int suffix = in.readVInt(); + //System.out.println(" start=" + start + " suffix=" + suffix); + assert start <= length: "start=" + start + " length=" + length; + + if (start + suffix > bytes.length) + bytes = ArrayUtil.grow(bytes, start+suffix); + in.readBytes(bytes, start, suffix); + length = start + suffix; + + // TODO: conversion could be incremental + UnicodeUtil.UTF8toUTF16(bytes, 0, length, chars); + started = true; + } + } + +} \ No newline at end of file Property changes on: src/java/org/apache/lucene/index/FormatPostingsTermsDictReader.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/FormatPostingsTermsDictDocsReader.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsTermsDictDocsReader.java (revision 0) +++ src/java/org/apache/lucene/index/FormatPostingsTermsDictDocsReader.java (revision 0) @@ -0,0 +1,46 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.BitVector; + +/** TermsDictReader interacts with a single instance of this + * to manage creation of multiple docs enum + * instances. */ +abstract class FormatPostingsTermsDictDocsReader { + + abstract class Reader { + abstract void readTerm(int docFreq, boolean isIndexTerm) throws IOException; + + /** Returns a docs enum for the last term read */ + abstract DocsEnum docs(BitVector deletedDocs) throws IOException; + + abstract void close() throws IOException; + } + + abstract void start(IndexInput termsIn) throws IOException; + + /** Returns a new private reader for stepping through + * terms, getting DocsEnum. */ + abstract Reader reader(FieldInfo fieldInfo, IndexInput termsIn) throws IOException; + + abstract void close() throws IOException; +} Property changes on: src/java/org/apache/lucene/index/FormatPostingsTermsDictDocsReader.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/FormatPostingsTermsDictWriter.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsTermsDictWriter.java (revision 0) +++ src/java/org/apache/lucene/index/FormatPostingsTermsDictWriter.java (revision 0) @@ -0,0 +1,244 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.List; +import java.util.ArrayList; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.ArrayUtil; + +/** + * Writes terms dict and interacts with docs/positions + * consumers to write the postings files. + * + * The [new] terms dict format is field-centric: each field + * has its own section in the file. Fields are written in + * UTF16 string comparison order. Within each field, each + * term's text is written in UTF16 string comparison order. 
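+ * For example, a segment with fields "body" and "title" gets two
+ * sections, "body" first. Every indexInterval'th term in a section is
+ * additionally written, with its file pointer, to the terms index file,
+ * which FormatPostingsTermsDictReader later subsamples via its
+ * indexDivisor.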
+ */ + +class FormatPostingsTermsDictWriter extends FieldsConsumer { + + // Initial format + public static final int FORMAT = -1; + + boolean DEBUG = FormatPostingsPositionsReader.DEBUG; + + public static final int FORMAT_CURRENT = FORMAT; + + private final int indexInterval; + private final DeltaBytesWriter termWriter; + private final DeltaBytesWriter termIndexWriter; + + final IndexOutput out; + final IndexOutput indexOut; + final DocsConsumer consumer; + final FieldInfos fieldInfos; + FieldInfo currentField; + + private List fields = new ArrayList(); + + // nocommit + private String segment; + + FormatPostingsTermsDictWriter(SegmentWriteState state, DocsConsumer consumer) throws IOException { + final String termsFileName = IndexFileNames.segmentFileName(state.segmentName, IndexFileNames.TERMS_EXTENSION); + out = state.directory.createOutput(termsFileName); + state.flushedFiles.add(termsFileName); + this.segment = state.segmentName; + + if (DEBUG) + System.out.println("tdw: write to segment=" + state.segmentName); + + final String indexFileName = IndexFileNames.segmentFileName(state.segmentName, IndexFileNames.TERMS_INDEX_EXTENSION); + indexOut = state.directory.createOutput(indexFileName); + state.flushedFiles.add(indexFileName); + + fieldInfos = state.fieldInfos; + indexInterval = state.termIndexInterval; + + // Count indexed fields up front + final int numFields = fieldInfos.size(); + + out.writeInt(FORMAT_CURRENT); // write format + out.writeLong(0); // leave space for end index pointer + out.writeInt(indexInterval); // write indexInterval + + termWriter = new DeltaBytesWriter(out); + termIndexWriter = new DeltaBytesWriter(indexOut); + currentField = null; + this.consumer = consumer; + + consumer.start(out); // have consumer write its format/header + } + + TermsConsumer addField(FieldInfo field) { + if (DEBUG) + System.out.println("tdw.addField: field=" + field.name); + assert currentField == null || currentField.name.compareTo(field.name) < 0; + currentField = field; + TermsConsumer terms = new TermsWriter(field, consumer); + fields.add(terms); + return terms; + } + + void finish() throws IOException { + try { + final long indexPointer = out.getFilePointer(); + final int fieldCount = fields.size(); + out.writeInt(fieldCount); + for(int i=0;i>>1; + if ((code & 1) != 0) + doc.numPositions = 1; + else + doc.numPositions = termsIn.readVInt(); + + if (doc.numPositions > doc.positions.length) + doc.reallocPositions(doc.numPositions); + + int position = 0; + int payloadLength = -1; + + for(int j=0;j>> 1; + if ((code2 & 1) != 0) + payloadLength = termsIn.readVInt(); + if (payloadLength > 0) { + if (pos.payload == null || payloadLength > pos.payload.length) + pos.payload = new byte[ArrayUtil.getNextSize(payloadLength)]; + termsIn.readBytes(pos.payload, 0, payloadLength); + } + } else + position += code2; + pos.pos = position; + pos.payloadLength = payloadLength; + } + } + doc.docID = docID; + } + + } else { + if (DEBUG) + System.out.println(" not pulsed pass isIndex=" + pendingIndexTerm); + + postingsReader.readTerm(docFreq, pendingIndexTerm); + pendingIndexTerm = false; + } + } + + public void close() throws IOException { + postingsReader.close(); + } + + final PulsingDocsEnum docsEnum = new PulsingDocsEnum(); + + DocsEnum docs(BitVector deletedDocs) throws IOException { + if (docFreq <= maxPulsingDocFreq) { + docsEnum.reset(deletedDocs); + return docsEnum; + } else + return postingsReader.docs(deletedDocs); + } + + class PulsingDocsEnum extends DocsEnum { + int nextRead; + private 
BitVector deletedDocs; + private Document doc; + + void close() {} + + void reset(BitVector deletedDocs) { + this.deletedDocs = deletedDocs; + nextRead = 0; + } + + boolean next() { + while(true) { + if (nextRead >= docFreq) + return false; + else { + doc = docs[nextRead++]; + if (deletedDocs == null || !deletedDocs.get(doc.docID)) + return true; + } + } + } + + int read(int[] retDocs, int[] retFreqs) { + int i=0; + // stop when the postings are exhausted or the caller's arrays are full + while(nextRead < docFreq && i < retDocs.length) { + doc = docs[nextRead++]; + if (deletedDocs == null || !deletedDocs.get(doc.docID)) { + retDocs[i] = doc.docID; + if (omitTF) + retFreqs[i] = 0; + else + retFreqs[i] = doc.numPositions; + i++; + } + } + return i; + } + + int doc() { + assert doc.docID >= 0: "got docID=" + doc.docID; + return doc.docID; + } + + int ord() { + assert nextRead <= docFreq; + return nextRead-1; + } + + int freq() { + return doc.numPositions; + } + + class PulsingPositionsEnum extends PositionsEnum { + int nextRead; + FormatPulsingDocsWriter.Position pos; + + void reset() { + nextRead = 0; + } + + int next() { + assert nextRead < doc.numPositions; + pos = doc.positions[nextRead++]; + return pos.pos; + } + + int getPayloadLength() { + return pos.payloadLength; + } + + boolean hasPayload() { + return pos.payloadLength > 0; + } + + byte[] getPayload(byte[] data, int offset) { + // nocommit -- inefficient + System.arraycopy(pos.payload, 0, data, offset, pos.payloadLength); + return data; + } + } + + final PulsingPositionsEnum positions = new PulsingPositionsEnum(); + + PositionsEnum positions() throws IOException { + positions.reset(); + return positions; + } + + boolean skip(int target) throws IOException { + while(next()) { + if (doc() >= target) + return true; + } + return false; + } + } + } +} Index: src/java/org/apache/lucene/index/FormatPulsingDocsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FormatPulsingDocsWriter.java (revision 0) +++ src/java/org/apache/lucene/index/FormatPulsingDocsWriter.java (revision 0) @@ -0,0 +1,278 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** Consumes doc & freq, writing them using the current + * index file format */ + +import java.io.IOException; +import java.util.List; +import java.util.ArrayList; + +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.store.IndexOutput; + +final class FormatPulsingDocsWriter extends DocsConsumer { + + final static String CODEC = "PulsedPostings"; + final static int VERSION_START = -1; + final static int VERSION_CURRENT = VERSION_START; + + IndexOutput termsOut; + + boolean DEBUG = FormatPostingsPositionsReader.DEBUG; + + boolean omitTF; + boolean storePayloads; + + // Starts a new term + FieldInfo fieldInfo; + + // nocommit + String desc; + + static class Document { + int docID; + int termDocFreq; + int numPositions; + Position[] positions; + Document() { + positions = new Position[1]; + positions[0] = new Position(); + } + + void reallocPositions(int minSize) { + final Position[] newArray = new Position[ArrayUtil.getNextSize(minSize)]; + System.arraycopy(positions, 0, newArray, 0, positions.length); + for(int i=positions.length;i maxPulsingDocFreq docs + + static class Position { + byte[] payload; + int pos; + int payloadLength; + } + + // nocommit -- lazy init this? ie, if every single term + // was pulsed then we never need to use this fallback? + // Fallback writer for non-pulsed terms: + final DocsConsumer postingsDocsWriter; + + /** If docFreq <= maxPulsingDocFreq, its postings are + * inlined into terms dict */ + + FormatPulsingDocsWriter(SegmentWriteState state, int maxPulsingDocFreq, DocsConsumer postingsDocsWriter) throws IOException { + super(); + + pendingDocs = new Document[maxPulsingDocFreq]; + for(int i=0;i 0) { + if (pos.payload == null || payloadLength > pos.payload.length) + pos.payload = new byte[ArrayUtil.getNextSize(payloadLength)]; + System.arraycopy(payload, payloadOffset, pos.payload, 0, payloadLength); + pos.payloadLength = payloadLength; + } else + pos.payloadLength = 0; + } + void finishDoc() { + assert currentDoc.numPositions == currentDoc.termDocFreq; + } + void finishTerm(boolean isIndexTerm) {} + void close() {} + } + + final PositionsWriter posWriter = new PositionsWriter(); + + PositionsConsumer addDoc(int docID, int termDocFreq) throws IOException { + + assert docID >= 0: "got docID=" + docID; + + if (DEBUG) + System.out.println("PW.addDoc: docID=" + docID + " pendingDocCount=" + pendingDocCount + " vs " + pendingDocs.length + " pulsed=" + pulsed); + + if (!pulsed && pendingDocCount == pendingDocs.length) { + + // OK we just crossed the threshold, this term should + // now be "pulsed" into the main postings codec: + postingsDocsWriter.startTerm(); + if (DEBUG) + System.out.println(" now flush buffer"); + for(int i=0;i= target */ + abstract boolean skip(int target) throws IOException; + + abstract boolean next() throws IOException; + + abstract int doc(); + + abstract int freq(); + + abstract int ord(); + + abstract int read(int[] docs, int[] freqs) throws IOException; + + // nocommit -- maybe move this up to TermsEnum? 
that + // would disallow changing positions format/reader of each + // doc, though + /** Don't call next() or skipTo() or read() until you're + * done consuming the positions */ + abstract PositionsEnum positions() throws IOException; + + abstract void close() throws IOException; +} Index: src/java/org/apache/lucene/index/FieldsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/FieldsConsumer.java (revision 0) +++ src/java/org/apache/lucene/index/FieldsConsumer.java (revision 0) @@ -0,0 +1,36 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +/** Abstract API that consumes terms, doc, freq, prox and + * payloads postings. Concrete implementations of this + * actually do "something" with the postings (write it into + * the index in a specific format). + * + * NOTE: this API is experimental and will likely change + */ +abstract class FieldsConsumer { + + /** Add a new field */ + abstract TermsConsumer addField(FieldInfo field) throws IOException; + + /** Called when we are done adding everything. */ + abstract void finish() throws IOException; +} Property changes on: src/java/org/apache/lucene/index/FieldsConsumer.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/FormatPostingsFieldsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsFieldsConsumer.java (revision 718730) +++ src/java/org/apache/lucene/index/FormatPostingsFieldsConsumer.java (working copy) @@ -1,36 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -/** Abstract API that consumes terms, doc, freq, prox and - * payloads postings. 
Concrete implementations of this - * actually do "something" with the postings (write it into - * the index in a specific format). - * - * NOTE: this API is experimental and will likely change - */ -abstract class FormatPostingsFieldsConsumer { - - /** Add a new field */ - abstract FormatPostingsTermsConsumer addField(FieldInfo field) throws IOException; - - /** Called when we are done adding everything. */ - abstract void finish() throws IOException; -} Index: src/java/org/apache/lucene/index/SegmentMerger.java =================================================================== --- src/java/org/apache/lucene/index/SegmentMerger.java (revision 718730) +++ src/java/org/apache/lucene/index/SegmentMerger.java (working copy) @@ -30,6 +30,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; /** * The SegmentMerger class combines two or more Segments, represented by an IndexReader ({@link #add}, @@ -481,10 +482,12 @@ private final void mergeTerms() throws CorruptIndexException, IOException { - SegmentWriteState state = new SegmentWriteState(null, directory, segment, null, mergedDocs, 0, termIndexInterval); + SegmentWriteState state = new SegmentWriteState(null, directory, segment, fieldInfos, null, mergedDocs, 0, termIndexInterval); + + final DocsConsumer docsWriter = new FormatPostingsDocsWriter(state); + final DocsConsumer pulsingWriter = new FormatPulsingDocsWriter(state, 1, docsWriter); + final FieldsConsumer consumer = new FormatPostingsTermsDictWriter(state, pulsingWriter); - final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos); - try { queue = new SegmentMergeQueue(readers.size()); @@ -498,7 +501,7 @@ boolean omitTF; - private final void mergeTermInfos(final FormatPostingsFieldsConsumer consumer) throws CorruptIndexException, IOException { + private final void mergeTermInfos(final FieldsConsumer consumer) throws CorruptIndexException, IOException { int base = 0; final int readerCount = readers.size(); for (int i = 0; i < readerCount; i++) { @@ -525,7 +528,7 @@ SegmentMergeInfo[] match = new SegmentMergeInfo[readers.size()]; String currentField = null; - FormatPostingsTermsConsumer termsConsumer = null; + TermsConsumer termsConsumer = null; while (queue.size() > 0) { int matchSize = 0; // pop matching terms @@ -572,6 +575,8 @@ return delCounts; } + private char[] termBuffer; + /** Process postings from multiple segments all positioned on the * same term. Writes out merged entries into freqOutput and * the proxOutput streams. 
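* The merged term's text is copied into a shared termBuffer and
* terminated with 0xffff (never a valid UTF-16 code unit) before being
* handed to startTerm, so a consumer can find the term's end without a
* separate length argument, e.g.:
*
*   int len = 0;
*   while(termBuffer[offset+len] != 0xffff) len++;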
@@ -582,10 +587,17 @@ * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ - private final int appendPostings(final FormatPostingsTermsConsumer termsConsumer, SegmentMergeInfo[] smis, int n) + private final int appendPostings(final TermsConsumer termsConsumer, SegmentMergeInfo[] smis, int n) throws CorruptIndexException, IOException { - final FormatPostingsDocsConsumer docConsumer = termsConsumer.addTerm(smis[0].term.text); + final String text = smis[0].term.text; + final int len = text.length(); + if (termBuffer == null || termBuffer.length < 1+len) + termBuffer = new char[ArrayUtil.getNextSize(1+len)]; + text.getChars(0, len, termBuffer, 0); + termBuffer[len] = 0xffff; + + final DocsConsumer docConsumer = termsConsumer.startTerm(termBuffer, 0); int df = 0; for (int i = 0; i < n; i++) { SegmentMergeInfo smi = smis[i]; @@ -598,13 +610,18 @@ while (postings.next()) { df++; int doc = postings.doc(); - if (docMap != null) + if (docMap != null) { doc = docMap[doc]; // map around deletions + assert doc != -1: "postings enum returned deleted docID " + postings.doc() + " freq=" + postings.freq() + " df=" + df; + } doc += base; // convert to merged space final int freq = postings.freq(); - final FormatPostingsPositionsConsumer posConsumer = docConsumer.addDoc(doc, freq); + final PositionsConsumer posConsumer = docConsumer.addDoc(doc, freq); + // nocommit -- omitTF should be "private", and this + // code (and FreqProxTermsWriter) should instead + // check if posConsumer is null? if (!omitTF) { for (int j = 0; j < freq; j++) { final int position = postings.nextPosition(); @@ -616,12 +633,13 @@ } posConsumer.addPosition(position, payloadBuffer, 0, payloadLength); } - posConsumer.finish(); + posConsumer.finishDoc(); } } } - docConsumer.finish(); + termsConsumer.finishTerm(termBuffer, 0, df); + return df; } Index: src/java/org/apache/lucene/index/FreqProxTermsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FreqProxTermsWriter.java (revision 718730) +++ src/java/org/apache/lucene/index/FreqProxTermsWriter.java (working copy) @@ -88,21 +88,25 @@ } } + final int numAllFields = allFields.size(); + // Sort by field name Collections.sort(allFields); - final int numAllFields = allFields.size(); // TODO: allow Lucene user to customize this consumer: - final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos); + final DocsConsumer docsWriter = new FormatPostingsDocsWriter(state); + final DocsConsumer pulsingWriter = new FormatPulsingDocsWriter(state, 1, docsWriter); + final FieldsConsumer consumer = new FormatPostingsTermsDictWriter(state, pulsingWriter); + /* Current writer chain: - FormatPostingsFieldsConsumer - -> IMPL: FormatPostingsFieldsWriter - -> FormatPostingsTermsConsumer - -> IMPL: FormatPostingsTermsWriter - -> FormatPostingsDocConsumer - -> IMPL: FormatPostingsDocWriter - -> FormatPostingsPositionsConsumer + FieldsConsumer + -> IMPL: FormatPostingsTermsDictWriter + -> TermsConsumer + -> IMPL: FormatPostingsTermsDictWriter.TermsWriter + -> DocsConsumer + -> IMPL: FormatPostingsDocsWriter + -> PositionsConsumer -> IMPL: FormatPostingsPositionsWriter */ @@ -155,7 +159,7 @@ * instances) found in this field and serialize them * into a single RAM segment. 
*/ void appendPostings(FreqProxTermsWriterPerField[] fields, - FormatPostingsFieldsConsumer consumer) + FieldsConsumer consumer) throws CorruptIndexException, IOException { int numFields = fields.length; @@ -172,7 +176,7 @@ assert result; } - final FormatPostingsTermsConsumer termsConsumer = consumer.addField(fields[0].fieldInfo); + final TermsConsumer termsConsumer = consumer.addField(fields[0].fieldInfo); FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields]; @@ -196,11 +200,15 @@ termStates[numToMerge++] = mergeStates[i]; } - final FormatPostingsDocsConsumer docConsumer = termsConsumer.addTerm(termStates[0].text, termStates[0].textOffset); + final char[] termText = termStates[0].text; + final int termTextOffset = termStates[0].textOffset; + final DocsConsumer docConsumer = termsConsumer.startTerm(termText, termTextOffset); + // Now termStates has numToMerge FieldMergeStates // which all share the same term. Now we must // interleave the docID streams. + int numDocs = 0; while(numToMerge > 0) { FreqProxFieldMergeState minState = termStates[0]; @@ -209,8 +217,9 @@ minState = termStates[i]; final int termDocFreq = minState.termFreq; + numDocs++; - final FormatPostingsPositionsConsumer posConsumer = docConsumer.addDoc(minState.docID, termDocFreq); + final PositionsConsumer posConsumer = docConsumer.addDoc(minState.docID, termDocFreq); final ByteSliceReader prox = minState.prox; @@ -241,7 +250,7 @@ posConsumer.addPosition(position, payloadBuffer, 0, payloadLength); } //End for - posConsumer.finish(); + posConsumer.finishDoc(); } if (!minState.nextDoc()) { @@ -269,7 +278,7 @@ } } - docConsumer.finish(); + termsConsumer.finishTerm(termText, termTextOffset, numDocs); } termsConsumer.finish(); Index: src/java/org/apache/lucene/index/TermsEnum.java =================================================================== --- src/java/org/apache/lucene/index/TermsEnum.java (revision 0) +++ src/java/org/apache/lucene/index/TermsEnum.java (revision 0) @@ -0,0 +1,53 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.util.BitVector; + +/** + * NOTE: this API is experimental and will likely change + */ + +abstract class TermsEnum { + + // nocommit -- char[] or byte[] version? + /** Seeks to the specified term. Returns true if the term + * exists. */ + abstract boolean seek(String text) throws IOException; + + /** Increments the enumeration to the next element. True if one exists.*/ + abstract boolean next() throws IOException; + + // nocommit -- char[] or byte[] version? 
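+ // A typical consumer of this API (sketch; assumes 'termsEnum' is any
+ // concrete TermsEnum and 'deletedDocs' comes from the owning reader;
+ // use(...) is illustrative only):
+ //
+ //   while(termsEnum.next()) {
+ //     DocsEnum docsEnum = termsEnum.docs(deletedDocs);
+ //     while(docsEnum.next())
+ //       use(docsEnum.doc(), docsEnum.freq());
+ //     docsEnum.close();
+ //   }
+ //   termsEnum.close();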
+ /** Returns the text for current Term in the enumeration.*/ + abstract String text(); + + /** Returns the docFreq of the current Term in the enumeration.*/ + abstract int docFreq(); + + /** Get DocsEnum for the current term. You should not + * call {@link #next()} or {@link #seek()} until you're + * done using the DocsEnum. */ + abstract DocsEnum docs(BitVector deletedDocs) throws IOException; + + /** Closes the enumeration to further activity, freeing resources. */ + abstract void close() throws IOException; +} + Index: src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java (revision 718730) +++ src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java (working copy) @@ -1,71 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -final class FormatPostingsTermsWriter extends FormatPostingsTermsConsumer { - - final FormatPostingsFieldsWriter parent; - final FormatPostingsDocsWriter docsWriter; - final TermInfosWriter termsOut; - FieldInfo fieldInfo; - - FormatPostingsTermsWriter(SegmentWriteState state, FormatPostingsFieldsWriter parent) throws IOException { - super(); - this.parent = parent; - termsOut = parent.termsOut; - docsWriter = new FormatPostingsDocsWriter(state, this); - } - - void setField(FieldInfo fieldInfo) { - this.fieldInfo = fieldInfo; - docsWriter.setField(fieldInfo); - } - - char[] currentTerm; - int currentTermStart; - - long freqStart; - long proxStart; - - /** Adds a new term in this field */ - FormatPostingsDocsConsumer addTerm(char[] text, int start) { - currentTerm = text; - currentTermStart = start; - - // TODO: this is abstraction violation -- ideally this - // terms writer is not so "invasive", looking for file - // pointers in its child consumers. - freqStart = docsWriter.out.getFilePointer(); - if (docsWriter.posWriter.out != null) - proxStart = docsWriter.posWriter.out.getFilePointer(); - - parent.skipListWriter.resetSkip(); - - return docsWriter; - } - - /** Called when we are done adding terms to this field */ - void finish() { - } - - void close() throws IOException { - docsWriter.close(); - } -} Index: src/java/org/apache/lucene/index/FieldsProducer.java =================================================================== --- src/java/org/apache/lucene/index/FieldsProducer.java (revision 0) +++ src/java/org/apache/lucene/index/FieldsProducer.java (revision 0) @@ -0,0 +1,33 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +/** Abstract API that provides terms, doc, freq, prox and + * payloads postings. Concrete implementations of this + * actually do "something" to read the postings from some + * store. + * + * NOTE: this API is experimental and will likely change + */ + +abstract class FieldsProducer { + // TODO: field iteration API? + abstract TermsProducer getField(FieldInfo fieldInfo) throws IOException; +} Index: src/java/org/apache/lucene/index/TermInfo.java =================================================================== --- src/java/org/apache/lucene/index/TermInfo.java (revision 718730) +++ src/java/org/apache/lucene/index/TermInfo.java (working copy) @@ -17,7 +17,10 @@ * limitations under the License. */ -/** A TermInfo is the record of information stored for a term.*/ +/** A TermInfo is the record of information stored for a + * term + * @deprecated This class is no longer used in flexible + * indexing. */ final class TermInfo { /** The number of documents which contain the term. */ Index: src/java/org/apache/lucene/index/FormatPostingsTermsDictPositionsReader.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsTermsDictPositionsReader.java (revision 0) +++ src/java/org/apache/lucene/index/FormatPostingsTermsDictPositionsReader.java (revision 0) @@ -0,0 +1,40 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.store.IndexInput; + +abstract class FormatPostingsTermsDictPositionsReader { + + abstract class Reader { + abstract void readTerm(int docFreq, boolean isIndexTerm) throws IOException; + + /** Returns a pos enum for the last term read */ + abstract PositionsEnum positions() throws IOException; + + abstract void close() throws IOException; + } + + abstract void start(IndexInput termsIn) throws IOException; + + abstract Reader reader(FieldInfo fieldInfo, IndexInput termsIn) throws IOException; + + abstract void close() throws IOException; +} Property changes on: src/java/org/apache/lucene/index/FormatPostingsTermsDictPositionsReader.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/TermInfosReader.java =================================================================== --- src/java/org/apache/lucene/index/TermInfosReader.java (revision 718730) +++ src/java/org/apache/lucene/index/TermInfosReader.java (working copy) @@ -27,8 +27,9 @@ /** This stores a monotonically increasing set of pairs in a * Directory. Pairs are accessed either by Term or by ordinal position the - * set. */ - + * set + * @deprecated This class has been replaced by + * FormatPostingsTermsDictReader, except for reading old segments. */ final class TermInfosReader { private Directory directory; private String segment; Index: src/java/org/apache/lucene/index/DocumentsWriter.java =================================================================== --- src/java/org/apache/lucene/index/DocumentsWriter.java (revision 718730) +++ src/java/org/apache/lucene/index/DocumentsWriter.java (working copy) @@ -531,7 +531,8 @@ synchronized private void initFlushState(boolean onlyDocStore) { initSegmentName(onlyDocStore); - flushState = new SegmentWriteState(this, directory, segment, docStoreSegment, numDocsInRAM, numDocsInStore, writer.getTermIndexInterval()); + flushState = new SegmentWriteState(this, directory, segment, docFieldProcessor.fieldInfos, + docStoreSegment, numDocsInRAM, numDocsInStore, writer.getTermIndexInterval()); } /** Flush all pending docs to a new segment */ @@ -594,13 +595,19 @@ return flushState.numDocs; } + boolean DEBUG = FormatPostingsPositionsReader.DEBUG; + /** Build compound file for the segment we just flushed */ void createCompoundFile(String segment) throws IOException { CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, segment + "." 
+ IndexFileNames.COMPOUND_FILE_EXTENSION); Iterator it = flushState.flushedFiles.iterator(); - while(it.hasNext()) - cfsWriter.addFile((String) it.next()); + while(it.hasNext()) { + final String fileName = (String) it.next(); + if (DEBUG) + System.out.println("make cfs " + fileName); + cfsWriter.addFile(fileName); + } // Perform the merge cfsWriter.close(); @@ -974,14 +981,17 @@ // Delete by query IndexSearcher searcher = new IndexSearcher(reader); iter = deletesFlushed.queries.entrySet().iterator(); + //System.out.println("DW: flush delete by query"); while(iter.hasNext()) { Entry entry = (Entry) iter.next(); Query query = (Query) entry.getKey(); + //System.out.println("\n del query=" + query.toString()); int limit = ((Integer) entry.getValue()).intValue(); Weight weight = query.weight(searcher); Scorer scorer = weight.scorer(reader); while(scorer.next()) { final int docID = scorer.doc(); + //System.out.println(" del docID=" + docID); if (docIDStart + docID >= limit) break; reader.deleteDocument(docID); Index: src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java (revision 718730) +++ src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java (working copy) @@ -1,32 +0,0 @@ -package org.apache.lucene.index; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -import org.apache.lucene.store.IndexInput; - -abstract class FormatPostingsPositionsConsumer { - - /** Add a new position & payload. If payloadLength > 0 - * you must read those bytes from the IndexInput. */ - abstract void addPosition(int position, byte[] payload, int payloadOffset, int payloadLength) throws IOException; - - /** Called when we are done adding positions & payloads */ - abstract void finish() throws IOException; -} Index: src/java/org/apache/lucene/index/TermInfosWriter.java =================================================================== --- src/java/org/apache/lucene/index/TermInfosWriter.java (revision 718730) +++ src/java/org/apache/lucene/index/TermInfosWriter.java (working copy) @@ -24,8 +24,10 @@ import org.apache.lucene.util.UnicodeUtil; /** This stores a monotonically increasing set of <Term, TermInfo> pairs in a - Directory. A TermInfos can be written once, in order. */ - + Directory. A TermInfos can be written once, in order. + * + * @deprecated This class has been replaced by + * FormatPostingsTermsDictWriter. */ final class TermInfosWriter { /** The file format version, a negative number. */ public static final int FORMAT = -3; @@ -36,193 +38,4 @@ // NOTE: always change this if you switch to a new format!
public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES; - - private FieldInfos fieldInfos; - private IndexOutput output; - private TermInfo lastTi = new TermInfo(); - private long size; - - // TODO: the default values for these two parameters should be settable from - // IndexWriter. However, once that's done, folks will start setting them to - // ridiculous values and complaining that things don't work well, as with - // mergeFactor. So, let's wait until a number of folks find that alternate - // values work better. Note that both of these values are stored in the - // segment, so that it's safe to change these w/o rebuilding all indexes. - - /** Expert: The fraction of terms in the "dictionary" which should be stored - * in RAM. Smaller values use more memory, but make searching slightly - * faster, while larger values use less memory and make searching slightly - * slower. Searching is typically not dominated by dictionary lookup, so - * tweaking this is rarely useful.*/ - int indexInterval = 128; - - /** Expert: The fraction of {@link TermDocs} entries stored in skip tables, - * used to accellerate {@link TermDocs#skipTo(int)}. Larger values result in - * smaller indexes, greater acceleration, but fewer accelerable cases, while - * smaller values result in bigger indexes, less acceleration and more - * accelerable cases. More detailed experiments would be useful here. */ - int skipInterval = 16; - - /** Expert: The maximum number of skip levels. Smaller values result in - * slightly smaller indexes, but slower skipping in big posting lists. - */ - int maxSkipLevels = 10; - - private long lastIndexPointer; - private boolean isIndex; - private byte[] lastTermBytes = new byte[10]; - private int lastTermBytesLength = 0; - private int lastFieldNumber = -1; - - private TermInfosWriter other; - private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result(); - - TermInfosWriter(Directory directory, String segment, FieldInfos fis, - int interval) - throws IOException { - initialize(directory, segment, fis, interval, false); - other = new TermInfosWriter(directory, segment, fis, interval, true); - other.other = this; - } - - private TermInfosWriter(Directory directory, String segment, FieldInfos fis, - int interval, boolean isIndex) throws IOException { - initialize(directory, segment, fis, interval, isIndex); - } - - private void initialize(Directory directory, String segment, FieldInfos fis, - int interval, boolean isi) throws IOException { - indexInterval = interval; - fieldInfos = fis; - isIndex = isi; - output = directory.createOutput(segment + (isIndex ? 
".tii" : ".tis")); - output.writeInt(FORMAT_CURRENT); // write format - output.writeLong(0); // leave space for size - output.writeInt(indexInterval); // write indexInterval - output.writeInt(skipInterval); // write skipInterval - output.writeInt(maxSkipLevels); // write maxSkipLevels - assert initUTF16Results(); - } - - void add(Term term, TermInfo ti) throws IOException { - UnicodeUtil.UTF16toUTF8(term.text, 0, term.text.length(), utf8Result); - add(fieldInfos.fieldNumber(term.field), utf8Result.result, utf8Result.length, ti); - } - - // Currently used only by assert statements - UnicodeUtil.UTF16Result utf16Result1; - UnicodeUtil.UTF16Result utf16Result2; - - // Currently used only by assert statements - private boolean initUTF16Results() { - utf16Result1 = new UnicodeUtil.UTF16Result(); - utf16Result2 = new UnicodeUtil.UTF16Result(); - return true; - } - - // Currently used only by assert statement - private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) { - - if (lastFieldNumber != fieldNumber) { - final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber)); - // If there is a field named "" (empty string) then we - // will get 0 on this comparison, yet, it's "OK". But - // it's not OK if two different field numbers map to - // the same name. - if (cmp != 0 || lastFieldNumber != -1) - return cmp; - } - - UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1); - UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2); - final int len; - if (utf16Result1.length < utf16Result2.length) - len = utf16Result1.length; - else - len = utf16Result2.length; - - for(int i=0;i, TermInfo> pair to the set. - Term must be lexicographically greater than all previous Terms added. 
- TermInfo pointers must be positive and greater than all previous.*/ - void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti) - throws IOException { - - assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 || - (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) : - "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" + - " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" + - " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8"); - - assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")"; - assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")"; - - if (!isIndex && size % indexInterval == 0) - other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term - - writeTerm(fieldNumber, termBytes, termBytesLength); // write term - - output.writeVInt(ti.docFreq); // write doc freq - output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers - output.writeVLong(ti.proxPointer - lastTi.proxPointer); - - if (ti.docFreq >= skipInterval) { - output.writeVInt(ti.skipOffset); - } - - if (isIndex) { - output.writeVLong(other.output.getFilePointer() - lastIndexPointer); - lastIndexPointer = other.output.getFilePointer(); // write pointer - } - - lastFieldNumber = fieldNumber; - lastTi.set(ti); - size++; - } - - private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength) - throws IOException { - - // TODO: UTF16toUTF8 could tell us this prefix - // Compute prefix in common with last term: - int start = 0; - final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength; - while(start < limit) { - if (termBytes[start] != lastTermBytes[start]) - break; - start++; - } - - final int length = termBytesLength - start; - output.writeVInt(start); // write shared prefix length - output.writeVInt(length); // write delta length - output.writeBytes(termBytes, start, length); // write delta bytes - output.writeVInt(fieldNumber); // write field num - if (lastTermBytes.length < termBytesLength) { - byte[] newArray = new byte[(int) (termBytesLength*1.5)]; - System.arraycopy(lastTermBytes, 0, newArray, 0, start); - lastTermBytes = newArray; - } - System.arraycopy(termBytes, start, lastTermBytes, start, length); - lastTermBytesLength = termBytesLength; - } - - /** Called to complete TermInfos creation. */ - void close() throws IOException { - output.seek(4); // write size after format - output.writeLong(size); - output.close(); - - if (!isIndex) - other.close(); - } - } Index: src/java/org/apache/lucene/index/PositionsConsumer.java =================================================================== --- src/java/org/apache/lucene/index/PositionsConsumer.java (revision 0) +++ src/java/org/apache/lucene/index/PositionsConsumer.java (revision 0) @@ -0,0 +1,43 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.store.IndexOutput; + +abstract class PositionsConsumer { + + abstract void start(IndexOutput termsOut) throws IOException; + + abstract void startTerm() throws IOException; + + /** Add a new position & payload. If payloadLength > 0 + * you must read those bytes from the IndexInput. NOTE: + * you must fully consume the byte[] payload, since + * caller is free to reuse it on subsequent calls. */ + abstract void addPosition(int position, byte[] payload, int payloadOffset, int payloadLength) throws IOException; + + /** Called when we are done adding positions & payloads + * for each doc */ + abstract void finishDoc() throws IOException; + + abstract void finishTerm(boolean isIndexTerm) throws IOException; + + abstract void close() throws IOException; +} Property changes on: src/java/org/apache/lucene/index/PositionsConsumer.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/PositionsEnum.java =================================================================== --- src/java/org/apache/lucene/index/PositionsEnum.java (revision 0) +++ src/java/org/apache/lucene/index/PositionsEnum.java (revision 0) @@ -0,0 +1,36 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +abstract class PositionsEnum { + + /** Returns the next position. You should only call this + * up to {@link FormatPostingsDocsEnum#freq()} times else + * the behavior is not defined. 
*/ + abstract int next() throws IOException; + + abstract int getPayloadLength(); + + // nocommit -- improve this so that readers that do their + // own buffering can save a copy + abstract byte[] getPayload(byte[] data, int offset) throws IOException; + + abstract boolean hasPayload(); +} Index: src/java/org/apache/lucene/index/SegmentWriteState.java =================================================================== --- src/java/org/apache/lucene/index/SegmentWriteState.java (revision 718730) +++ src/java/org/apache/lucene/index/SegmentWriteState.java (working copy) @@ -26,17 +26,38 @@ DocumentsWriter docWriter; Directory directory; String segmentName; + FieldInfos fieldInfos; String docStoreSegmentName; int numDocs; - int termIndexInterval; int numDocsInStore; Collection flushedFiles; - public SegmentWriteState(DocumentsWriter docWriter, Directory directory, String segmentName, String docStoreSegmentName, int numDocs, + /** Expert: The fraction of terms in the "dictionary" which should be stored + * in RAM. Smaller values use more memory, but make searching slightly + * faster, while larger values use less memory and make searching slightly + * slower. Searching is typically not dominated by dictionary lookup, so + * tweaking this is rarely useful.*/ + int termIndexInterval; + + /** Expert: The fraction of {@link TermDocs} entries stored in skip tables, + * used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in + * smaller indexes, greater acceleration, but fewer accelerable cases, while + * smaller values result in bigger indexes, less acceleration and more + * accelerable cases. More detailed experiments would be useful here. */ + int skipInterval = 16; + + /** Expert: The maximum number of skip levels. Smaller values result in + * slightly smaller indexes, but slower skipping in big posting lists.
+ */ + int maxSkipLevels = 10; + + public SegmentWriteState(DocumentsWriter docWriter, Directory directory, String segmentName, FieldInfos fieldInfos, + String docStoreSegmentName, int numDocs, int numDocsInStore, int termIndexInterval) { this.docWriter = docWriter; this.directory = directory; this.segmentName = segmentName; + this.fieldInfos = fieldInfos; this.docStoreSegmentName = docStoreSegmentName; this.numDocs = numDocs; this.numDocsInStore = numDocsInStore; Index: src/java/org/apache/lucene/util/ArrayUtil.java =================================================================== --- src/java/org/apache/lucene/util/ArrayUtil.java (revision 718730) +++ src/java/org/apache/lucene/util/ArrayUtil.java (working copy) @@ -110,6 +110,29 @@ return array; } + public static char[] grow(char[] array, int minSize) { + if (array.length < minSize) { + char[] newArray = new char[getNextSize(minSize)]; + System.arraycopy(array, 0, newArray, 0, array.length); + return newArray; + } else + return array; + } + + public static char[] grow(char[] array) { + return grow(array, 1+array.length); + } + + public static char[] shrink(char[] array, int targetSize) { + final int newSize = getShrinkSize(array.length, targetSize); + if (newSize != array.length) { + char[] newArray = new char[newSize]; + System.arraycopy(array, 0, newArray, 0, newSize); + return newArray; + } else + return array; + } + /** Returns hash of chars in range start (inclusive) to * end (inclusive) */ public static int hashCode(char[] array, int start, int end) { Index: src/java/org/apache/lucene/util/UnicodeUtil.java =================================================================== --- src/java/org/apache/lucene/util/UnicodeUtil.java (revision 718730) +++ src/java/org/apache/lucene/util/UnicodeUtil.java (working copy) @@ -77,11 +77,8 @@ public int length; public void setLength(int newLength) { - if (result.length < newLength) { - byte[] newArray = new byte[(int) (1.5*newLength)]; - System.arraycopy(result, 0, newArray, 0, length); - result = newArray; - } + if (result.length < newLength) + result = ArrayUtil.grow(result, newLength); length = newLength; } } @@ -92,11 +89,8 @@ public int length; public void setLength(int newLength) { - if (result.length < newLength) { - char[] newArray = new char[(int) (1.5*newLength)]; - System.arraycopy(result, 0, newArray, 0, length); - result = newArray; - } + if (result.length < newLength) + result = ArrayUtil.grow(result, newLength); length = newLength; } @@ -104,6 +98,13 @@ setLength(other.length); System.arraycopy(other.result, 0, result, 0, length); } + + public void copyText(String other) { + final int otherLength = other.length(); + setLength(otherLength); + other.getChars(0, otherLength, result, 0); + length = otherLength; + } } /** Encode characters from a char[] source, starting at Index: contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java =================================================================== --- contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java (revision 718730) +++ contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java (working copy) @@ -318,7 +318,7 @@ if (useMemIndex && useRAMIndex) { if (verbose) System.out.println("diff="+ (score1-score2) + ", query=" + queries[q] + ", s1=" + score1 + ", s2=" + score2); if (score1 != score2 || score1 < 0.0f || score2 < 0.0f || score1 > 1.0f || score2 > 1.0f) { - throw new IllegalStateException("BUG DETECTED:" + (i*(q+1)) + " at query=" + queries[q] + ", file=" + file + 
", anal=" + analyzer); + throw new IllegalStateException("BUG DETECTED:" + (i*(q+1)) + " at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer + " score1=" + score1 + " score2=" + score2); } } }