Index: CHANGES.txt =================================================================== --- CHANGES.txt (revision 594476) +++ CHANGES.txt (working copy) @@ -55,6 +55,12 @@ scheduler is now ConcurrentMergeScheduler (see LUCENE-870). (Steven Parkes via Mike McCandless) + 6. LUCENE-1052: Add IndexReader.setTermInfosIndexDivisor(int) method + that allows you to reduce memory usage of the termInfos by further + sub-sampling (over the termIndexInterval that was used during + indexing) which terms are loaded into memory. (Chuck Williams, + Doug Cutting via Mike McCandless) + Bug fixes 1. LUCENE-933: QueryParser fixed to not produce empty sub Index: src/test/org/apache/lucene/index/TestSegmentTermDocs.java =================================================================== --- src/test/org/apache/lucene/index/TestSegmentTermDocs.java (revision 594476) +++ src/test/org/apache/lucene/index/TestSegmentTermDocs.java (working copy) @@ -19,12 +19,14 @@ import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.store.MockRAMDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import java.io.IOException; +import org.apache.lucene.search.Similarity; public class TestSegmentTermDocs extends LuceneTestCase { private Document testDoc = new Document(); @@ -46,8 +48,13 @@ } public void testTermDocs() throws IOException { + testTermDocs(1); + } + + public void testTermDocs(int indexDivisor) throws IOException { //After adding the document, we should be able to read it back in SegmentReader reader = SegmentReader.get(info); + reader.setTermInfosIndexDivisor(indexDivisor); assertTrue(reader != null); SegmentTermDocs segTermDocs = new SegmentTermDocs(reader); assertTrue(segTermDocs != null); @@ -63,9 +70,14 @@ } public void testBadSeek() throws IOException { + testBadSeek(1); + } + + public void testBadSeek(int indexDivisor) throws IOException { { //After adding the document, we should be able to read it back in SegmentReader reader = SegmentReader.get(info); + reader.setTermInfosIndexDivisor(indexDivisor); assertTrue(reader != null); SegmentTermDocs segTermDocs = new SegmentTermDocs(reader); assertTrue(segTermDocs != null); @@ -76,6 +88,7 @@ { //After adding the document, we should be able to read it back in SegmentReader reader = SegmentReader.get(info); + reader.setTermInfosIndexDivisor(indexDivisor); assertTrue(reader != null); SegmentTermDocs segTermDocs = new SegmentTermDocs(reader); assertTrue(segTermDocs != null); @@ -86,6 +99,10 @@ } public void testSkipTo() throws IOException { + testSkipTo(1); + } + + public void testSkipTo(int indexDivisor) throws IOException { Directory dir = new RAMDirectory(); IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true); @@ -106,6 +123,9 @@ writer.close(); IndexReader reader = IndexReader.open(dir); + reader.setTermInfosIndexDivisor(indexDivisor); + assertEquals(indexDivisor, reader.getTermInfosIndexDivisor()); + TermDocs tdocs = reader.termDocs(); // without optimization (assumption skipInterval == 16) @@ -209,6 +229,31 @@ dir.close(); } + public void testIndexDivisor() throws IOException { + dir = new MockRAMDirectory(); + testDoc = new Document(); + DocHelper.setupDoc(testDoc); + DocHelper.writeDoc(dir, testDoc); + testTermDocs(2); + testBadSeek(2); + testSkipTo(2); + } + + public void testIndexDivisorAfterLoad() throws IOException { + dir = new MockRAMDirectory(); + testDoc = new Document(); + DocHelper.setupDoc(testDoc); + SegmentInfo si = DocHelper.writeDoc(dir, testDoc); + SegmentReader reader = SegmentReader.get(si); + assertEquals(1, reader.docFreq(new Term("keyField", "Keyword"))); + try { + reader.setTermInfosIndexDivisor(2); + fail("did not hit IllegalStateException exception"); + } catch (IllegalStateException ise) { + // expected + } + } + private void addDoc(IndexWriter writer, String value) throws IOException { Document doc = new Document(); Index: src/test/org/apache/lucene/index/TestSegmentReader.java =================================================================== --- src/test/org/apache/lucene/index/TestSegmentReader.java (revision 594476) +++ src/test/org/apache/lucene/index/TestSegmentReader.java (working copy) @@ -28,6 +28,9 @@ import org.apache.lucene.document.Fieldable; import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.store.MockRAMDirectory; +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.search.Similarity; public class TestSegmentReader extends LuceneTestCase { private RAMDirectory dir = new RAMDirectory(); @@ -204,4 +207,19 @@ assertTrue("We do not have 4 term freq vectors, we have: " + results.length, results.length == 4); } + public void testIndexDivisor() throws IOException { + dir = new MockRAMDirectory(); + testDoc = new Document(); + DocHelper.setupDoc(testDoc); + SegmentInfo si = DocHelper.writeDoc(dir, testDoc); + + reader = SegmentReader.get(si); + reader.setTermInfosIndexDivisor(3); + testDocument(); + testDelete(); + testGetFieldNameVariations(); + testNorms(); + testTerms(); + testTermVectors(); + } } Index: src/java/org/apache/lucene/index/SegmentReader.java =================================================================== --- src/java/org/apache/lucene/index/SegmentReader.java (revision 594476) +++ src/java/org/apache/lucene/index/SegmentReader.java (working copy) @@ -444,6 +444,14 @@ return si.docCount; } + public void setTermInfosIndexDivisor(int indexDivisor) throws IllegalStateException { + tis.setIndexDivisor(indexDivisor); + } + + public int getTermInfosIndexDivisor() { + return tis.getIndexDivisor(); + } + /** * @see IndexReader#getFieldNames(IndexReader.FieldOption fldOption) */ Index: src/java/org/apache/lucene/index/TermInfosReader.java =================================================================== --- src/java/org/apache/lucene/index/TermInfosReader.java (revision 594477) +++ src/java/org/apache/lucene/index/TermInfosReader.java (working copy) @@ -40,6 +40,9 @@ private long[] indexPointers; private SegmentTermEnum indexEnum; + + private int indexDivisor = 1; + private int totalIndexInterval; TermInfosReader(Directory dir, String seg, FieldInfos fis) throws CorruptIndexException, IOException { @@ -58,6 +61,7 @@ origEnum = new SegmentTermEnum(directory.openInput(segment + ".tis", readBufferSize), fieldInfos, false); size = origEnum.size; + totalIndexInterval = origEnum.indexInterval; indexEnum = new SegmentTermEnum(directory.openInput(segment + ".tii", readBufferSize), fieldInfos, true); @@ -82,6 +86,43 @@ public int getMaxSkipLevels() { return origEnum.maxSkipLevels; } + + /** + *
Sets the indexDivisor, which subsamples the number + * of indexed terms loaded into memory. This has a + * similar effect as {@link + * IndexWriter#setTermIndexInterval} except that setting + * must be done at indexing time while this setting can be + * set per reader. When set to N, then one in every + * N*termIndexInterval terms in the index is loaded into + * memory. By setting this to a value > 1 you can reduce + * memory usage, at the expense of higher latency when + * loading a TermInfo. The default value is 1.
+ * + * NOTE: you must call this before the term + * index is loaded. If the index is already loaded, + * an IllegalStateException is thrown. + * + + @throws IllegalStateException if the term index has + * already been loaded into memory. + */ + public void setIndexDivisor(int indexDivisor) throws IllegalStateException { + if (indexDivisor < 1) + throw new IllegalArgumentException("indexDivisor must be > 0: got " + indexDivisor); + + if (indexTerms != null) + throw new IllegalStateException("index terms are already loaded"); + + this.indexDivisor = indexDivisor; + totalIndexInterval = origEnum.indexInterval * indexDivisor; + } + + /** Returns the indexDivisor. + * @see #setIndexDivisor + */ + public int getIndexDivisor() { + return indexDivisor; + } final void close() throws IOException { if (origEnum != null) @@ -106,10 +147,10 @@ } private synchronized void ensureIndexIsRead() throws IOException { - if (indexTerms != null) // index already read - return; // do nothing + if (indexTerms != null) // index already read + return; // do nothing try { - int indexSize = (int)indexEnum.size; // otherwise read index + int indexSize = 1+((int)indexEnum.size-1)/indexDivisor; // otherwise read index indexTerms = new Term[indexSize]; indexInfos = new TermInfo[indexSize]; @@ -119,6 +160,10 @@ indexTerms[i] = indexEnum.term(); indexInfos[i] = indexEnum.termInfo(); indexPointers[i] = indexEnum.indexPointer; + + for (int j = 1; j < indexDivisor; j++) + if (!indexEnum.next()) + break; } } finally { indexEnum.close(); @@ -146,8 +191,8 @@ private final void seekEnum(int indexOffset) throws IOException { getEnum().seek(indexPointers[indexOffset], - (indexOffset * getEnum().indexInterval) - 1, - indexTerms[indexOffset], indexInfos[indexOffset]); + (indexOffset * totalIndexInterval) - 1, + indexTerms[indexOffset], indexInfos[indexOffset]); } /** Returns the TermInfo for a Term in the set, or null. */ @@ -161,7 +206,7 @@ if (enumerator.term() != null // term is at or past current && ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0) || term.compareTo(enumerator.term()) >= 0)) { - int enumOffset = (int)(enumerator.position/enumerator.indexInterval)+1; + int enumOffset = (int)(enumerator.position/totalIndexInterval)+1; if (indexTerms.length == enumOffset // but before end of block || term.compareTo(indexTerms[enumOffset]) < 0) return scanEnum(term); // no need to seek @@ -189,10 +234,10 @@ SegmentTermEnum enumerator = getEnum(); if (enumerator != null && enumerator.term() != null && position >= enumerator.position && - position < (enumerator.position + enumerator.indexInterval)) + position < (enumerator.position + totalIndexInterval)) return scanEnum(position); // can avoid seek - seekEnum(position / enumerator.indexInterval); // must seek + seekEnum(position/totalIndexInterval); // must seek return scanEnum(position); } Index: src/java/org/apache/lucene/index/IndexReader.java =================================================================== --- src/java/org/apache/lucene/index/IndexReader.java (revision 594476) +++ src/java/org/apache/lucene/index/IndexReader.java (working copy) @@ -298,6 +298,35 @@ throw new UnsupportedOperationException("This reader does not support this method."); } + /**For IndexReader implementations that use + * TermInfosReader to read terms, this sets the + * indexDivisor to subsample the number of indexed terms + * loaded into memory. This has the same effect as {@link + * IndexWriter#setTermIndexInterval} except that setting + * must be done at indexing time while this setting can be + * set per reader. When set to N, then one in every + * N*termIndexInterval terms in the index is loaded into + * memory. By setting this to a value > 1 you can reduce + * memory usage, at the expense of higher latency when + * loading a TermInfo. The default value is 1.
+ * + * NOTE: you must call this before the term + * index is loaded. If the index is already loaded, + * an IllegalStateException is thrown. + * @throws IllegalStateException if the term index has already been loaded into memory + */ + public void setTermInfosIndexDivisor(int indexDivisor) throws IllegalStateException { + throw new UnsupportedOperationException("This reader does not support this method."); + } + + /**For IndexReader implementations that use + * TermInfosReader to read terms, this returns the + * current indexDivisor. + * @see #setTermInfosIndexDivisor */ + public int getTermInfosIndexDivisor() { + throw new UnsupportedOperationException("This reader does not support this method."); + } + /** * Check whether this IndexReader is still using the * current (i.e., most recently committed) version of the Index: src/java/org/apache/lucene/index/MultiSegmentReader.java =================================================================== --- src/java/org/apache/lucene/index/MultiSegmentReader.java (revision 594476) +++ src/java/org/apache/lucene/index/MultiSegmentReader.java (working copy) @@ -299,7 +299,18 @@ return fieldSet; } + public void setTermInfosIndexDivisor(int indexDivisor) throws IllegalStateException { + for (int i = 0; i < subReaders.length; i++) + subReaders[i].setTermInfosIndexDivisor(indexDivisor); + } + public int getTermInfosIndexDivisor() throws IllegalStateException { + if (subReaders.length > 0) + return subReaders[0].getTermInfosIndexDivisor(); + else + throw new IllegalStateException("no readers"); + } + static class MultiTermEnum extends TermEnum { private SegmentMergeQueue queue;