Index: src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java =================================================================== --- src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java (revision 563241) +++ src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java (working copy) @@ -190,6 +190,41 @@ writer.addDocument(doc); } + public String segString(IndexWriter writer) { + + SegmentInfos segmentInfos = writer.segmentInfos; + + StringBuffer buffer = new StringBuffer(); + + for(int i = 0; i < segmentInfos.size(); i++) { + + if (i > 0) { + buffer.append(' '); + } + + SegmentInfo info = segmentInfos.info(i); + + try { + if (info.getUseCompoundFile()) { + buffer.append('c'); + } else { + buffer.append('C'); + } + } catch (Exception e) { + } + + if (info.dir != writer.getDirectory()) { + buffer.append('x'); + } + + buffer.append(info.docCount); + + } + + return buffer.toString(); + + } + private void checkInvariants(IndexWriter writer) throws IOException { int maxBufferedDocs = writer.getMaxBufferedDocs(); int mergeFactor = writer.getMergeFactor(); @@ -198,30 +233,31 @@ int ramSegmentCount = writer.getNumBufferedDocuments(); assertTrue(ramSegmentCount < maxBufferedDocs); - int lowerBound = -1; - int upperBound = maxBufferedDocs; + int previousBound = -1; + int currentBound = maxBufferedDocs; int numSegments = 0; int segmentCount = writer.getSegmentCount(); for (int i = segmentCount - 1; i >= 0; i--) { int docCount = writer.getDocCount(i); - assertTrue(docCount > lowerBound); - if (docCount <= upperBound) { + assertTrue(docCount > previousBound); + + if (docCount <= currentBound) { numSegments++; } else { - if (upperBound * mergeFactor <= maxMergeDocs) { + if (currentBound * mergeFactor <= maxMergeDocs) { assertTrue(numSegments < mergeFactor); } do { - lowerBound = upperBound; - upperBound *= mergeFactor; - } while (docCount > upperBound); + previousBound = currentBound; + currentBound *= mergeFactor; + } while (docCount > currentBound); numSegments = 1; } } - if (upperBound * mergeFactor <= maxMergeDocs) { + if (currentBound * mergeFactor <= maxMergeDocs) { assertTrue(numSegments < mergeFactor); } Index: src/test/org/apache/lucene/index/TestLogDocMergePolicy.java =================================================================== --- src/test/org/apache/lucene/index/TestLogDocMergePolicy.java (revision 0) +++ src/test/org/apache/lucene/index/TestLogDocMergePolicy.java (revision 0) @@ -0,0 +1,214 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import junit.framework.TestCase; + +public class TestLogDocMergePolicy extends TestCase { + + void assertNotEquals(Object a, Object b) { + assertFalse(a.equals(b)); + } + + MockIndexMerger merger; + LogDocMergePolicy policy; + + void set(String sizes) { + merger = new MockIndexMerger (sizes); + policy = new LogDocMergePolicy(); + policy.setMaxMergeDocs(100000); + } + + void merge(String before, String after, int mergeFactor, int minMergeDocs) + throws CorruptIndexException, IOException { + set(before); + merger.setMaxBufferedDocs(minMergeDocs); + policy.setMergeFactor(mergeFactor); + policy.merge(merger.segmentInfos, merger); + MockIndexMerger other = new MockIndexMerger(after); + assertEquals(other.toString(), merger.toString()); + } + + void merge(String before, String after, int mergeFactor) + throws CorruptIndexException, IOException { + merge(before, after, mergeFactor, MockIndexMerger.DEFAULT_MAX_BUFFERED_DOCS); + + } + + void merge(String before, String after) + throws CorruptIndexException, IOException { + merge(before, + after, + LogDocMergePolicy.DEFAULT_MERGE_FACTOR, + MockIndexMerger.DEFAULT_MAX_BUFFERED_DOCS); + } + + public void testOneMerge() + throws CorruptIndexException, IOException { + merge("1000 10 0 10", "1000 10 10", 4, 4); + } + + public void testConsistentMerge() + throws CorruptIndexException, IOException { + + MockIndexMerger other; + + merge("0 10", "0 10"); + merge("1000 10 0 10", "1000 10 10", 4, 4); + merge("1000 0 0 10 0 0 10", "1000 10 0 10", 4); + merge("1000 100 10 1 1 1 1 1 1 1 1 1 1 1 1 1 1", "1000 100 19 1 1 1 1 1"); + merge("1000 100 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0", "1000 100 10 0 0 0 0 0"); + merge("9 9 9 9 9 9 9 9 9 9","90"); + merge("10 10 10 10 10 10 10 10 10 10", "100"); + merge("100 100 100 100 100 100 100 100 100 100", "1000"); + merge("1000 10 x10 x10 x10", "1000 10 10 10 10"); + merge("1000 9 x9 x9 x9", "1000 9 9 9 9"); + merge("1000 10 x10 x10 x10", "1000 40", 4); + merge("1000 9 x9 x9 x9", "1000 36", 4); + merge("1000 0 10 0 10", "1000 20", 4); + + } + + public void testInconsistentMerge() + throws CorruptIndexException, IOException { + + merge("1001 101 11 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1001", "1120 1008"); + merge("1001 101 11 1 1 1 1 1 1 1 1 1 1 1 1 1 1 11", "1001 101 20 16"); + + } + + public void testChangeMergeFactor() + throws CorruptIndexException, IOException { + + MockIndexMerger other; + + set("11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11"); + policy.setMergeFactor(5); + policy.merge(merger.segmentInfos, merger); + other = new MockIndexMerger("275 11"); + assertEquals(other.toString(), merger.toString()); + + set("0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11"); + policy.setMergeFactor(5); + policy.merge(merger.segmentInfos, merger); + other = new MockIndexMerger("11"); + assertEquals(other.toString(), merger.toString()); + + set("0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10"); + policy.setMergeFactor(5); + policy.merge(merger.segmentInfos, merger); + other = new MockIndexMerger("0 10"); + assertEquals(other.toString(), merger.toString()); + } + + public void testOptimize() + throws CorruptIndexException, IOException { + + MockIndexMerger other; + + set ("1000 100 10 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1000"); + + other = new MockIndexMerger("2124"); + policy.optimize(merger.segmentInfos, merger); + assertEquals(other.toString(), merger.toString()); + + set ("x1000"); + + other = new MockIndexMerger("1000"); + policy.optimize(merger.segmentInfos, merger); 
+ assertEquals(other.toString(), merger.toString()); + + set ("1000 100 10 1"); + + other = new MockIndexMerger("1111"); + policy.optimize(merger.segmentInfos, merger); + assertEquals(other.toString(), merger.toString()); + + } + + public void testDirs() + throws CorruptIndexException, IOException { + + MockIndexMerger other; + + set ("100 10 1"); + + other = new MockIndexMerger("100 10 1"); + assertEquals(other.toString(), merger.toString()); + + other = new MockIndexMerger("100 x10 1"); + assertNotEquals(other.toString(), merger.toString()); + + set ("100 x10 1"); + policy.merge(merger.segmentInfos, merger); + + other = new MockIndexMerger("100 10 1"); + assertEquals(other.toString(), merger.toString()); + + } + + public void testInconsistent() { + + set("1001 1000 1000 101 100 11 1"); + assertTrue(policy.isConsistent(merger.segmentInfos, merger)); + + set("1001 1000 1001 1000 101 11 1"); + assertFalse(policy.isConsistent(merger.segmentInfos, merger)); + + set("1001 1000 1001 101 1000 101 11 1"); + assertFalse(policy.isConsistent(merger.segmentInfos, merger)); + + set("1001 11 101 1"); + assertFalse(policy.isConsistent(merger.segmentInfos, merger)); + + set("1001 101 11"); + assertTrue(policy.isConsistent(merger.segmentInfos, merger)); + + set("100001 10001 1001 101 11"); + try { + policy.isConsistent(merger.segmentInfos, merger); + fail(); + } catch (IllegalArgumentException e) { + } + + } + + public void testLowestConsistentBound() { + + set("101 101 1001 101 1"); + assertTrue(policy.lowestConsistentBound(merger.segmentInfos, merger) == + 10000); + + set("10000 1000 100 10 1 100 10 1"); + // System.err.println("? " + policy.lowestConsistentBound(merger.segmentInfos)); + assertTrue(policy.lowestConsistentBound(merger.segmentInfos, merger) == + 100); + + set("1000 1000 100 10 1 1000"); + assertTrue(policy.lowestConsistentBound(merger.segmentInfos, merger) == + 1000); + + set("1000 100 10 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1000"); + assertTrue(policy.lowestConsistentBound(merger.segmentInfos, merger) == + 1000); + + } + +} Property changes on: src/test/org/apache/lucene/index/TestLogDocMergePolicy.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/test/org/apache/lucene/index/MockIndexMerger.java =================================================================== --- src/test/org/apache/lucene/index/MockIndexMerger.java (revision 0) +++ src/test/org/apache/lucene/index/MockIndexMerger.java (revision 0) @@ -0,0 +1,264 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.MockRAMDirectory; + +import java.util.Map; +import java.util.HashMap; +import java.io.IOException; + +class MockIndexMerger implements IndexMerger { + + MergePolicy policy; + + public final static int DEFAULT_MAX_BUFFERED_DOCS = 10; + + private int maxBufferedDocs = DEFAULT_MAX_BUFFERED_DOCS; + + public void setMaxBufferedDocs(int maxBufferedDocs) { + this.maxBufferedDocs = maxBufferedDocs; + } + + public int getMaxBufferedDocs() { + return maxBufferedDocs; + } + + private double ramBufferSizeMB = 0; + + public void setRAMBufferSizeMB(double ramBufferSizeMB) { + this.ramBufferSizeMB = ramBufferSizeMB; + } + + public double getRAMBufferSizeMB() { + return ramBufferSizeMB; + } + + void setMergePolicy(MergePolicy policy) { + this.policy = policy; + } + + public synchronized void optimize() + throws CorruptIndexException, IOException { + policy.optimize(segmentInfos, this); + } + + public synchronized void merge() + throws CorruptIndexException, IOException { + policy.merge(segmentInfos, this); + } + + MockIndexMerger(String string) { + + String[] strings = string.split("\\s"); + + segmentInfos = new SegmentInfos(); + + for(int i = 0; i < strings.length; i++) { + + StringBuffer buffer = new StringBuffer(strings[i]); + + boolean isCompoundFile = true; + boolean hasSingleNormsFile = true; + String dir = ""; + + while (buffer.charAt(0) < '0' || buffer.charAt(0) > '9') { + + char c = buffer.charAt(0); + buffer.deleteCharAt(0); + + switch(c) { + case 'c': isCompoundFile = true; break; + case 'C': isCompoundFile = false; break; + case 'x': dir = "x"; break; + case 'y': dir = "y"; break; + case 'z': dir = "z"; break; + } + + } + + int size = Integer.parseInt(buffer.toString()); + + SegmentInfo info = new SegmentInfo("name" + i, + size, + the(dir)); + info.setUseCompoundFile(isCompoundFile); + segmentInfos.addElement(info); + + } + } + + public void add(MockIndexMerger other) { + add(copy(other.segmentInfos)); + } + + protected void merge(SegmentInfo segment) { + } + + void add(SegmentInfos segments) { + synchronized(segmentInfos) { + segmentInfos.addAll(segments); + } + } + + static SegmentInfos copy(SegmentInfos segmentInfos) { + SegmentInfos segments = new SegmentInfos(); + synchronized(segmentInfos) { + segments.addAll(segmentInfos); + } + return segments; + } + + void replace(MergePolicy.MergeSpecification spec, SegmentInfo info) { + synchronized(segmentInfos) { + + SegmentInfos segments = spec.segments; + + for(int i = 0; i < segments.size(); i++) { + + int index = segmentInfos.indexOf(segments.info(i)); + + if (index < 0) { + throw new RuntimeException("could not replace segmentInfo"); + } + + if (i == 0) { + segmentInfos.set(index, info); + } else { + segmentInfos.removeElementAt(index); + } + + } + + } + } + + public int merge(MergePolicy.MergeSpecification spec) + throws CorruptIndexException, IOException { + + SegmentInfo info = new SegmentInfo("name", 0, directory); + info.setUseCompoundFile(spec.useCompoundFile); + + SegmentInfos segments = copy(spec.segments); + + int docCount = 0; + + for(int i = 0; i < segments.size(); i++) { + merge(segments.info(i)); + docCount += segments.info(i).docCount; + } + + info.docCount = docCount; + + replace(spec, info); + + return docCount; + } + + public Directory getDirectory() { + return directory; + } + + SegmentInfos segmentInfos; + Directory directory = the(); + + public String segString(SegmentInfos segments, + int first, + int beyond) { 
+ + segments = copy(segments); + + StringBuffer buffer = new StringBuffer(); + + for(int i = first; i < beyond; i++) { + + if (i > first) { + buffer.append(' '); + } + + buffer.append('['); + buffer.append(i); + buffer.append(']'); + + SegmentInfo info = segments.info(i); + + try { + if (info.getUseCompoundFile()) { + buffer.append('c'); + } else { + buffer.append('C'); + } + } catch (Exception e) { + } + + if (info.dir != getDirectory()) { + buffer.append('x'); + } + + buffer.append(info.docCount); + + } + + return buffer.toString(); + + } + + public String segString(SegmentInfos segments) { + synchronized(segments) { + return segString(segments, 0, segments.size()); + } + } + + public String segString(MergePolicy.MergeSpecification spec) { + return segString(spec.segments); + } + + public String toString() { + return segString(segmentInfos); + } + + // utilities for managing a bunch of dirs + + static Map instances = new HashMap(); + + String name; + + static MockRAMDirectory the(String name) { + try { + if (!instances.containsKey(name)) { + instances.put(name, new MockRAMDirectory()); + } + return (MockRAMDirectory)instances.get(name); + } catch ( IOException ioe ) { + return null; + } + } + + static MockRAMDirectory the() { + return the(""); + } + + public String[] _list() { + return new String[0]; + } + +} + Property changes on: src/test/org/apache/lucene/index/MockIndexMerger.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/test/org/apache/lucene/index/TestMockIndexMerger.java =================================================================== --- src/test/org/apache/lucene/index/TestMockIndexMerger.java (revision 0) +++ src/test/org/apache/lucene/index/TestMockIndexMerger.java (revision 0) @@ -0,0 +1,89 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import junit.framework.TestCase; + +public class TestMockIndexMerger extends TestCase { + + void assertNotEquals(Object a, Object b) { + assertFalse(a.equals(b)); + } + + public void testMockIndexMerger() { + + assertEquals(new MockIndexMerger("1001 101 11 1").toString(), + new MockIndexMerger("1001 101 11 1").toString()); + + assertNotEquals(new MockIndexMerger("1001 101 11 1").toString(), + new MockIndexMerger("1001 101 11 11").toString()); + + } + + SegmentInfos segmentInfos(SegmentInfos all, int first, int last) { + SegmentInfos subset = new SegmentInfos(); + subset.addAll(all.subList(first, last)); + return subset; + } + + public void testUseCompoundFile() throws IOException { + + MockIndexMerger merger = new MockIndexMerger("c1001 C101"); + + assertTrue(merger.segmentInfos.info(0).getUseCompoundFile()); + assertFalse(merger.segmentInfos.info(1).getUseCompoundFile()); + + MergePolicy.MergeSpecification spec = new MergePolicy.MergeSpecification(); + spec.segments = segmentInfos(merger.segmentInfos, 0, 1); + spec.useCompoundFile = true; + + merger.merge(spec); + + assertTrue(merger.segmentInfos.info(0).getUseCompoundFile()); + assertFalse(merger.segmentInfos.info(1).getUseCompoundFile()); + + spec.segments = segmentInfos(merger.segmentInfos, 0, 1); + spec.useCompoundFile = false; + + merger.merge(spec); + + assertFalse(merger.segmentInfos.info(0).getUseCompoundFile()); + assertFalse(merger.segmentInfos.info(1).getUseCompoundFile()); + + spec.segments = segmentInfos(merger.segmentInfos, 1, 2); + spec.useCompoundFile = false; + + merger.merge(spec); + + assertFalse(merger.segmentInfos.info(0).getUseCompoundFile()); + assertFalse(merger.segmentInfos.info(1).getUseCompoundFile()); + + spec.segments = segmentInfos(merger.segmentInfos, 1, 2); + spec.useCompoundFile = true; + + merger.merge(spec); + + assertFalse(merger.segmentInfos.info(0).getUseCompoundFile()); + assertTrue(merger.segmentInfos.info(1).getUseCompoundFile()); + + + } + +} Property changes on: src/test/org/apache/lucene/index/TestMockIndexMerger.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/test/org/apache/lucene/index/TestIndexWriterDelete.java =================================================================== --- src/test/org/apache/lucene/index/TestIndexWriterDelete.java (revision 563241) +++ src/test/org/apache/lucene/index/TestIndexWriterDelete.java (working copy) @@ -32,9 +32,17 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.MockRAMDirectory; import org.apache.lucene.store.RAMDirectory; +import java.util.List; public class TestIndexWriterDelete extends TestCase { + class KeepAllDeletionPolicy implements IndexDeletionPolicy { + public void onInit(List commits) { + } + public void onCommit(List commits) { + } + } + // test the simple case public void testSimpleCase() throws IOException { String[] keywords = { "1", "2" }; @@ -445,15 +453,18 @@ } // Whether we succeeded or failed, check that all - // un-referenced files were in fact deleted (ie, - // we did not create garbage). Just create a - // new IndexFileDeleter, have it delete - // unreferenced files, then verify that in fact - // no files were deleted: + // un-referenced files were in fact deleted (ie, we did not + // create garbage). First we write the current segmentInfos + // out to disk. This will make sure the deleter doesn't + // delete files related to the current, unwritten + // state. 
Then create a new IndexFileDeleter with a keep all + // policy and have it delete unreferenced files and verify + // that in fact no files were deleted: + modifier.segmentInfos.write(dir); String[] startFiles = dir.list(); SegmentInfos infos = new SegmentInfos(); infos.read(dir); - IndexFileDeleter d = new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null); + IndexFileDeleter d = new IndexFileDeleter(dir, new KeepAllDeletionPolicy(), infos, null , null); String[] endFiles = dir.list(); Arrays.sort(startFiles); @@ -496,6 +507,7 @@ Hits hits = null; try { hits = searcher.search(new TermQuery(searchTerm)); + // System.err.println( "hits after: " + hits.length() ); } catch (IOException e) { e.printStackTrace(); @@ -503,10 +515,16 @@ } int result2 = hits.length(); if (success) { - if (result2 != END_COUNT) { + // If we succeded, then we are in the second pass and have + // the results from the first. The results will depend on + // when the failure occured in the previous pass. All we + // guaranttee is that we will have either have seen all + // the deletes or udpates or none of them. + if (result2 != START_COUNT && result2 != END_COUNT) { + // System.err.println( "*Q1 " + autoCommit + " " + updates + " **" ); fail(testName + ": method did not throw exception but hits.length for search on term 'aaa' is " - + result2 + " instead of expected " + END_COUNT); + + result2 + " instead of expected " + START_COUNT + " or " + END_COUNT); } } else { // On hitting exception we still may have added Index: src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java =================================================================== --- src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java (revision 563241) +++ src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java (working copy) @@ -272,7 +272,8 @@ writer.addIndexesNoOptimize(new Directory[] { aux, aux }); assertEquals(1020, writer.docCount()); - assertEquals(2, writer.getSegmentCount()); + /* note this is different than the 2.2 merge because of corner cases in the default policy */ + assertEquals(3, writer.getSegmentCount()); assertEquals(1000, writer.getDocCount(0)); writer.close(); Index: src/test/org/apache/lucene/index/TestIndexModifier.java =================================================================== --- src/test/org/apache/lucene/index/TestIndexModifier.java (revision 563241) +++ src/test/org/apache/lucene/index/TestIndexModifier.java (working copy) @@ -193,7 +193,7 @@ } public int docFreq(Term term) throws IOException { synchronized(directory) { - assureOpen(); + ensureOpen(); createIndexReader(); return indexReader.docFreq(term); } Index: src/java/org/apache/lucene/index/LogDocMergePolicy.java =================================================================== --- src/java/org/apache/lucene/index/LogDocMergePolicy.java (revision 0) +++ src/java/org/apache/lucene/index/LogDocMergePolicy.java (revision 0) @@ -0,0 +1,751 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Vector; + +/** + * Logarithmic merge policy + * + */ + +/* Logarithmic merge policy + * + * The LMP maintains segments as increasing (from the right) groups of + * segments where the number of documents in each segment increases + * exponentially. + * + * The LMP has three parameters: + * + * minMergeDocs: this is the lowest number of documents in a segment (comes from IndexWriter) + * mergeFactor: this is the base of the logarithm + * maxMergeDocs: this is used to compute the bound on the segment size + * + * Segments are conceptually assigned a level based on the number of + * documents they contain: + * + * level = ceiling(logM(ceiling(docs / minMergeDocs))) + * + * logM: log-base mergeFactor + * docs = number of documents in the segment + * + * In the simple case, where no deletes are occurring, when a new + * segment of level N is created and M-1 segments already exist at + * that level, a new merge will be initiated to replace the now M + * segments at level N with a new segment at level N+1. + * + * Notes: + * + * The number of documents does not (currently) consider latent + * deleted documents. + * + * The number of documents that a merge produces is dependent on + * latent deleted documents in the segments making up the + * merge and the result of a merge may result in segment list that + * does not maintain the desired logarithmic invariant. + * + */ + +class LogDocMergePolicy implements MergePolicy { + + /** + * Default value is 10. Change using {@link #setMergeFactor(int)}. + */ + public final static int DEFAULT_MERGE_FACTOR = 10; + + /** + * Default value is {@link Integer#MAX_VALUE}. Change using {@link #setMaxMergeDocs(int)}. + */ + public final static int DEFAULT_MAX_MERGE_DOCS = Integer.MAX_VALUE; + + /** Use compound file setting. Defaults to true, minimizing the number of + * files used. Setting this to false may improve indexing performance, but + * may also cause file handle problems. + */ + private boolean useCompoundFile = true; + + public LogDocMergePolicy() { + } + + public void close() {} + + /** Setting to turn on usage of a compound file. When on, multiple files + * for each segment are merged into a single file once the segment creation + * is finished. This is done regardless of what directory is in use. + */ + public void setUseCompoundFile(boolean useCompoundFile) { + this.useCompoundFile = useCompoundFile; + } + + /** See {@link #getUseCompountFile(boolean)} + */ + public boolean getUseCompoundFile() { + return useCompoundFile; + } + + /** Called by writers/mergers to determine whether newly created + * segments should be in compound file format or not. + */ + public boolean useCompoundFile(SegmentInfos segments, + SegmentInfo newSegment) { + return useCompoundFile; + } + + private int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS; + + /** Determines the largest number of documents ever merged by addDocument(). + * Small values (e.g., less than 10,000) are best for interactive indexing, + * as this limits the length of pauses while indexing to a few seconds. 
+ * Larger values are best for batched indexing and speedier searches. + * + *
The default value is {@link Integer#MAX_VALUE}. + */ + public void setMaxMergeDocs(int maxMergeDocs) { + if (maxMergeDocs < 2) + throw new IllegalArgumentException("maxMergeDocs must at least be 2"); + this.maxMergeDocs = maxMergeDocs; + } + + /** See {@link #setMaxMergeDocs(int)} + */ + public int getMaxMergeDocs() { + return maxMergeDocs; + } + + private int mergeFactor = DEFAULT_MERGE_FACTOR; + + /** Determines how often segment indices are merged during addDocument(). With + * smaller values, less RAM is used while indexing, and searches on + * unoptimized indices are faster, but indexing speed is slower. With larger + * values, more RAM is used during indexing, and while searches on unoptimized + * indices are slower, indexing is faster. Thus larger values (> 10) are best + * for batch index creation, and smaller values (< 10) for indices that are + * interactively maintained. + * + *
This must never be less than 2. The default value is 10. + */ + public void setMergeFactor(int mergeFactor) { + if (mergeFactor < 2) + throw new IllegalArgumentException("mergeFactor must at least be 2"); + this.mergeFactor = mergeFactor; + } + + /** + * @see #setMergeFactor + */ + public int getMergeFactor() { + return mergeFactor; + } + + /* a ConsistencyState object wraps the merge invariants: + * + * 1) that the list of segments, viewed from the end, is a + * monotonically increasing in level (the log of the number of + * documents (not including deleted documents)). + * + * 2) All segments are owned by the destination directory. + * + * Inconsistencies are measured in three ways: + * + * lowestConsistentBound: if not zero, it is the lowest logarithmic + * level above which logarithmic monotonicity holds. E.g., (assuming + * minMergeDocs = mergeFactor = 10 + * LCB(100 10 1) == 0 + * LCB(100 1 10) == 100 + * LCB(1 10 100) == 1000 + * + * firstInconsistentSegment: when the invariants aren't met, the + * firstInconsistentSegment is the index of, well, the first + * inconsistent segment, e.g., (same givens as above) + * FIS(100 1 10) = 1 + * + * inconsistentDirectories: a list of directories not contained by + * the merge which need to be copied if not otherwise merged + * + * Notes: + * + * Consistency in this context does not include checking that the number of segments at a level is <= M + * but the merge policy will still consolidate those. + * + */ + + class ConsistencyState { + + Vector inconsistentDirectories = new Vector(); + int lowestConsistentBound = 0; + int firstInconsistentSegment = -1; + + ConsistencyState(SegmentInfos segmentInfos, IndexMerger merger) { + + int previousBound = -1; + int currentBound = merger.getMaxBufferedDocs(); + + /* synchronized(segmentInfos) */ { + + for (int i = segmentInfos.size()-1; i >=0; i--) { + + SegmentInfo info = segmentInfos.info(i); + + if (info.dir != merger.getDirectory()) { + inconsistentDirectories.add(0, new Integer(i)); + } + + int docCount = info.docCount; + + if (docCount <= previousBound) { + lowestConsistentBound = currentBound; + firstInconsistentSegment = i; + } + + while (docCount > currentBound) { + + previousBound = currentBound; + currentBound *= mergeFactor; + + if (currentBound > maxMergeDocs) { + throw new IllegalArgumentException("No segment size can exceed maxMergeDocs"); + } + } + + } + + } + + } + + boolean isConsistent() { + return lowestConsistentBound == 0 && inconsistentDirectories.isEmpty(); + } + + } + + int lowestConsistentBound(SegmentInfos segmentInfos, IndexMerger merger) { + return new ConsistencyState(segmentInfos, merger).lowestConsistentBound; + } + + protected int merge(MergeSpecification spec, IndexMerger merger) + throws MergeException, CorruptIndexException, IOException { + int docCount = merger.merge(spec); + return docCount; + } + + SegmentInfos segmentInfos(SegmentInfos all, int first, int last) { + SegmentInfos subset = new SegmentInfos(); + /* synchronized(all) */ { + subset.addAll(all.subList(first, last)); + } + return subset; + } + + static SegmentInfos copy(SegmentInfos segmentInfos) { + SegmentInfos result = new SegmentInfos(); + /* synchronized(segmentInfos) */ { + result.addAll(segmentInfos); + } + return result; + } + + /* The primary method for handling merges is cascadingMerge. It + * takes a consistent list of segments and generates primitive + * merges until the logarithmic invariants are met. 
+   *
+   * This function takes two parameters that override the "default"
+   * behaviour:
+   *
+   * firstBound: this parameter effectively overrides minMergeDocs for
+   * one call, the result being that all segments with less than
+   * firstBound docs will be considered to be at the same level. This
+   * behaviour is used when making an inconsistent index consistent:
+   * cascadingMerge is called with firstBound = lowestConsistentBound.
+   *
+   * firstSegmentToConsider: this is used to manually set the left
+   * boundary of the subsequence. It also causes a merge even if the
+   * number of segments that would be merged is less than
+   * mergeFactor. It is used as part of the process of making an index
+   * consistent.
+   *
+   * Differences from the non-factored version:
+   *
+   * Processing of non-minimum sized segments: this function, rather
+   * than starting at minimum sized segments and stopping if none are
+   * found, looks at the lowest level found and checks it, stopping if
+   * it is okay. This is designed to complement the consistency checker
+   * which handles lists where levels are not contiguous but allows
+   * segments which are over populated (too big) but logarithmically
+   * monotonic.
+   *
+   * Notes on concurrency: if the concurrent merge work goes forward,
+   * this function needs to be changed structurally. When concurrent
+   * merges can occur, segmentInfos can change over time and in ways
+   * not predictable from the current thread. In particular, during a
+   * merge, segmentInfos can change a lot and thus the loop below which
+   * expects to do multiple merges based on data gleaned before
+   * beginning the merges probably needs to change.
+   */
+
+  public void cascadingMerge(SegmentInfos segmentInfos,
+                             IndexMerger merger,
+                             int firstBound,
+                             int firstSegmentToConsider)
+    throws MergeException, CorruptIndexException, IOException {
+
+    long previousBound = -1;
+    long currentBound = firstBound;
+
+    assert currentBound != 0;
+
+    /* Determine the bounds for the smallest existing segment */
+
+    int firstCount = segmentInfos.info(segmentInfos.size()-1).docCount;
+    while (firstCount > currentBound) {
+      previousBound = currentBound;
+      currentBound *= mergeFactor;
+    }
+
+    /* find the contiguous subsequence of segments on this level,
+     * searching from the right */
+
+    while (currentBound <= maxMergeDocs) {
+
+      /* starts like this: */
+      /* abcdefg */
+      /*r     l*/
+      /* and iterates by pulling the left pointer towards the left */
+
+      int left = segmentInfos.size();
+      int right = -1;
+
+      /* search from the right ... */
+
+      while(--left >= 0) {
+
+        int docCount = segmentInfos.info(left).docCount;
+
+        /* first time a segment is in bounds, set right */
+
+        if (right == -1 && docCount > previousBound && docCount <= currentBound) {
+          right = left;
+        }
+
+        /* first time a segment is above bound, stop, leaving left where it is
+         * else continue, which pulls left towards the left */
+
+        if (docCount > currentBound) {
+          break;
+        }
+
+      }
+
+      int first = left + 1;
+
+      if (firstSegmentToConsider >= 0) {
+        first = firstSegmentToConsider;
+      }
+
+      int length = right - first + 1;
+
+      /* intoNextLevel is used to determine how to handle the results
+       * of a merge. If the merge results in a segment in the next
+       * level, a cascade will ensue. Otherwise, the merge has created
+       * a segment at the same level as the previous segments and is a
+       * candidate for further merging at that level.
+       */
+
+      boolean intoNextLevel = false;
+
+      MergeSpecification spec = new MergeSpecification();
+      spec.useCompoundFile = useCompoundFile;
+
+      while (length > 0) {
+
+        int mergeLength = mergeFactor;
+
+        /* Normally, if the length of the subsequence is less than
+         * mergeFactor, no merge will occur. This is modified if first
+         * has been overridden, as described above.
+         */
+
+        if (length < mergeLength) {
+          if (firstSegmentToConsider == -1) {
+            break;
+          }
+          mergeLength = length;
+        }
+
+        int last = first + mergeLength;
+
+        /* Create the subsequence. Note this creates a new segmentInfos
+         * (so it's not == to the one owned by the merger). This
+         * should help in concurrency but is not alone sufficient.
+         */
+
+        spec.segments = segmentInfos(segmentInfos, first, last);
+
+        int docCount = 0;
+
+        docCount = merge(spec, merger);
+
+        length -= mergeLength;
+
+        firstSegmentToConsider = -1;
+
+        if (docCount == -1) {
+
+          /* If the merge ends up being concurrent, we don't know the
+           * number of documents that will result. So we just look at
+           * the rest of the segments. Note, though, that the bump by
+           * mergeLength assumes that segmentInfos hasn't been updated
+           * before this code is run, which is not safe. That's an
+           * example of where this needs to be tweaked for
+           * concurrency. Partly this could be handled by going to a
+           * recursive rather than iterative structure. But there
+           * are other issues that need to be explored / understood ...
+           */
+
+          first += mergeLength;
+
+        } else if (docCount > currentBound) {
+
+          /* If the new segment is into the next level, record that
+           * fact and continue with other segments at this level. */
+
+          first++;
+          intoNextLevel = true;
+
+        } else {
+
+          /* The new segment is still in the current level, so just
+           * add it back to this list of segments on this level */
+
+          length++;
+
+        }
+
+      }
+
+      /* At this point, there aren't mergeFactor segments left to be
+       * merged at this level. If at some point we created a segment
+       * at the next level, we need to check it, so we'll fall through
+       * the break. Otherwise, the break fires, indicating that no
+       * further merges are needed to ensure consistency (as long as
+       * we were already consistent).
+       */
+      if (!intoNextLevel) {
+        break;
+      }
+
+      previousBound = currentBound;
+      currentBound *= mergeFactor;
+
+    }
+
+  }
+
+  public void cascadingMerge(SegmentInfos segmentInfos,
+                             IndexMerger merger,
+                             int firstBound)
+    throws MergeException, CorruptIndexException, IOException {
+    cascadingMerge(segmentInfos, merger, firstBound, /* firstSegmentToConsider = */ -1);
+  }
+
+  public void cascadingMerge(SegmentInfos segmentInfos, IndexMerger merger)
+    throws MergeException, CorruptIndexException, IOException {
+    cascadingMerge(segmentInfos,
+                   merger,
+                   /* firstBound = */ merger.getMaxBufferedDocs(),
+                   /* firstSegmentToConsider = */ -1);
+  }
+
+  public boolean isConsistent(SegmentInfos segmentInfos,
+                              IndexMerger merger) {
+    return ((new ConsistencyState(segmentInfos, merger)).isConsistent());
+  }
+
+  /* Use the merger to copy a sequence of segments */
+  void copy(SegmentInfos segmentInfos,
+            IndexMerger merger,
+            int first,
+            int last)
+    throws MergeException, CorruptIndexException, IOException {
+
+    MergeSpecification spec = new MergeSpecification();
+    spec.segments = segmentInfos(segmentInfos, first, last);
+    spec.useCompoundFile = useCompoundFile;
+
+    merge(spec, merger);
+
+  }
+
+  /* This is the "alternative" to cascadingMerge. It is used to take
+   * an inconsistent list of segments and make them consistent. It
+   * uses cascadingMerge and copy to do this.
+   *
+   * This code is based closely on the non-factored version.
+   *
+   * Differences from the non-factored version:
+   *
+   * In the non-factored version, IndexWriter#addIndexesNoOptimize
+   * has most of the code for making an index consistent. It's
+   * necessary in that function since its purpose is to aggregate a
+   * number of indexes, in which case it's highly unlikely the
+   * resulting sequence will follow the logarithmic
+   * invariants. However, there are other reasons an index will not be
+   * consistent. It may well not be consistent if parameters
+   * change. Deletes can also cause merges that result in
+   * inconsistency (I think).
+   *
+   * makeConsistent is supposed to take an inconsistent list and make
+   * it consistent. It does this without regard for what caused the
+   * inconsistency. Because it makes fewer assumptions, the algorithm
+   * is slightly different from addIndexesNoOptimize and the results
+   * can be different in a few cases (though the invariants will still
+   * hold).
+   */
+
+  boolean makeConsistent(SegmentInfos segmentInfos, IndexMerger merger)
+    throws MergeException, CorruptIndexException, IOException {
+
+    ConsistencyState state = new ConsistencyState(segmentInfos, merger);
+
+    if (state.isConsistent()) {
+      return false;
+    }
+
+    /* If the segment list is not composed of contiguous subsequences
+     * at monotonically increasing levels, use cascadingMerge with
+     * firstBound == the lowest consistent bound. This will divide
+     * the list into two pieces: a (possibly empty) consistent set on
+     * the left and an inconsistent set on the right, e.g.,
+     *
+     *  1000 1000   10 100 10
+     *  < con    >  < incon >
+     *
+     * Starting with the left-most inconsistent segment,
+     * cascadingMerge will merge mergeFactor segments at a time using
+     * its normal algorithm for the given level.
+     */
+
+    if (state.lowestConsistentBound > 0) {
+      cascadingMerge(segmentInfos, merger, state.lowestConsistentBound);
+      state = new ConsistencyState(segmentInfos, merger);
+    }
+
+    /* cascadingMerge will stop when there are fewer than mergeFactor
+     * segments left, but there may still be inconsistency in the last
+     * segments, of which there may be up to mergeFactor - 1 segments.
+     * For example, we could be left with
+     *
+     * 1000 100 100 10 100
+     *
+     * with M = 4. Since the first inconsistency is only one from the end,
+     * it will not get merged.
+     *
+     * We use cascadingMerge again, but force it to merge these
+     * segments by specifying the firstInconsistentSegment.
+     */
+
+    if (state.lowestConsistentBound > 0) {
+      cascadingMerge(segmentInfos,
+                     merger,
+                     state.lowestConsistentBound,
+                     state.firstInconsistentSegment);
+    }
+
+    /* All of these merges will (should?) have left the sequence
+     * consistent, but there may be further possible merges. Try */
+
+    cascadingMerge(segmentInfos, merger);
+
+    state = new ConsistencyState(segmentInfos, merger);
+
+    /* Finally, it's possible this merger doesn't even own some of
+     * these segments. If all of the earlier merges left some external
+     * segments untouched, copy them in unchanged (except for garbage
+     * collected deleted docs) */
+
+    if (state.inconsistentDirectories.size() > 0) {
+
+      for(int i = 0; i < state.inconsistentDirectories.size(); i++) {
+
+        int segment = ((Integer)(state.inconsistentDirectories.elementAt(i))).intValue();
+        copy(segmentInfos, merger, segment, segment+1);
+      }
+
+      /* The copy could have generated segments that, via deleted doc
+       * gc, no longer obey the invariants. So just try the whole
+       * thing again.
This might be overkill? Or maybe we should + * actually be doing this in more cases? + */ + + makeConsistent(segmentInfos, merger); + + } + + return true; + + } + + /* MergeException is a placeholder for other exceptions in derived + * classes. LMP never throws it so if it catches one, it turns into + * a runtime error, which it is. + */ + + class MergeException extends Exception { + } + + protected void checkedMerge(SegmentInfos segmentInfos, + IndexMerger merger) + throws MergeException, CorruptIndexException, IOException { + + assert segmentInfos.size() > 0; + + /* Since any merge can result in inconsistency, we have to run + * makeConsistent aftewards anyway. Would it be better to combine + * the code and put it in a while loop? Would that be too much + * extra computation? Not sure yet ... + */ + + if (!makeConsistent(segmentInfos, merger)) { + cascadingMerge(segmentInfos, merger); + makeConsistent(segmentInfos, merger); + } + + } + + /** + *
The amount of free space required when a merge is + * triggered is up to 1X the size of all segments being + * merged, when no readers/searchers are open against the + * index, and up to 2X the size of all segments being + * merged when readers/searchers are open against the + * index (see {@link #optimize()} for details). Most + * merges are small (merging the smallest segments + * together), but whenever a full merge occurs (all + * segments in the index, which is the worst case for + * temporary space usage) then the maximum free disk space + * required is the same as {@link #optimize}.
+ */ + public void merge(SegmentInfos segmentInfos, + IndexMerger merger) + throws CorruptIndexException, IOException { + + try { + checkedMerge(segmentInfos, merger); + } catch (MergeException me) { + throw new RuntimeException ("merge exception"); + } + } + + protected void checkedOptimize(SegmentInfos segmentInfos, + IndexMerger merger) + throws MergeException, CorruptIndexException, IOException { + + if (segmentInfos.size() > 0) { + + while ((segmentInfos.size() > 1) || + SegmentReader.hasDeletions(segmentInfos.info(0)) || + SegmentReader.hasSeparateNorms(segmentInfos.info(0)) || + (segmentInfos.info(0).dir != merger.getDirectory()) || + (useCompoundFile && (!SegmentReader.usesCompoundFile(segmentInfos.info(0))))) { + + MergeSpecification spec = new MergeSpecification(); + + int first = segmentInfos.size() - mergeFactor; + first = first < 0 ? 0 : first; + int last = segmentInfos.size(); + + spec.segments = segmentInfos(segmentInfos, first, last); + spec.useCompoundFile = useCompoundFile; + + merge(spec, merger); + + } + + } + + } + + public void optimize(SegmentInfos segmentInfos, + IndexMerger merger) + throws CorruptIndexException, IOException { + try { + checkedOptimize(segmentInfos, merger); + } catch (MergeException me) { + throw new RuntimeException ("merge exception"); + } + } + + static public String segString(SegmentInfos segmentInfos, + int first, + int beyond, + IndexMerger merger) { + + StringBuffer buffer = new StringBuffer(); + + for(int i = first; i < beyond; i++) { + + if (i > first) { + buffer.append(' '); + } + + buffer.append('['); + buffer.append(i); + buffer.append(']'); + + SegmentInfo info = segmentInfos.info(i); + + try { + if (info.getUseCompoundFile()) { + buffer.append('c'); + } else { + buffer.append('C'); + } + } catch (Exception e) { + } + + if (info.dir != merger.getDirectory()) { + buffer.append('x'); + } + + buffer.append(info.docCount); + + } + + return buffer.toString(); + + } + + public String segString(SegmentInfos segmentInfos, + IndexMerger merger, + int first, + int beyond) { + + return segString(segmentInfos, merger, first, beyond); + } + + static public String segString(SegmentInfos segmentInfos, + IndexMerger merger) { + return segString(segmentInfos, 0, segmentInfos.size(), merger); + } + + static public String segString(MergeSpecification spec, IndexMerger merger) { + return segString(spec.segments, merger); + } + +} Property changes on: src/java/org/apache/lucene/index/LogDocMergePolicy.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/IndexModifier.java =================================================================== --- src/java/org/apache/lucene/index/IndexModifier.java (revision 563241) +++ src/java/org/apache/lucene/index/IndexModifier.java (working copy) @@ -101,12 +101,25 @@ // Lucene defaults: protected PrintStream infoStream = null; - protected boolean useCompoundFile = true; + + private MergePolicy mergePolicy = new LogDocMergePolicy(); + + /** + * @deprecated + * @see LogDocMergePolicy#getUseCompoundFile(boolean) + */ + protected boolean useCompoundFile = getUseCompoundFile();; + protected int maxBufferedDocs = IndexWriter.DEFAULT_MAX_BUFFERED_DOCS; protected int maxFieldLength = IndexWriter.DEFAULT_MAX_FIELD_LENGTH; - protected int mergeFactor = IndexWriter.DEFAULT_MERGE_FACTOR; /** + * @deprecated + * @see: LogDocMergePolicy.DEFAULT_MERGE_FACTOR + */ + protected int mergeFactor = LogDocMergePolicy.DEFAULT_MERGE_FACTOR; + + /** * Open 
an index with write access. * * @param directory the index directory @@ -180,7 +193,7 @@ * Throw an IllegalStateException if the index is closed. * @throws IllegalStateException */ - protected void assureOpen() { + protected void ensureOpen() { if (!open) { throw new IllegalStateException("Index is closed"); } @@ -202,11 +215,10 @@ } indexWriter = new IndexWriter(directory, analyzer, false); indexWriter.setInfoStream(infoStream); - indexWriter.setUseCompoundFile(useCompoundFile); if (maxBufferedDocs != 0) indexWriter.setMaxBufferedDocs(maxBufferedDocs); indexWriter.setMaxFieldLength(maxFieldLength); - indexWriter.setMergeFactor(mergeFactor); + indexWriter.setMergePolicy(mergePolicy); } } @@ -218,6 +230,8 @@ protected void createIndexReader() throws CorruptIndexException, IOException { if (indexReader == null) { if (indexWriter != null) { + indexWriter.flush(); + indexWriter.releaseMergePolicy(); indexWriter.close(); indexWriter = null; } @@ -235,8 +249,10 @@ */ public void flush() throws CorruptIndexException, LockObtainFailedException, IOException { synchronized(directory) { - assureOpen(); + ensureOpen(); if (indexWriter != null) { + indexWriter.flush(); + indexWriter.releaseMergePolicy(); indexWriter.close(); indexWriter = null; createIndexWriter(); @@ -263,7 +279,7 @@ */ public void addDocument(Document doc, Analyzer docAnalyzer) throws CorruptIndexException, LockObtainFailedException, IOException { synchronized(directory) { - assureOpen(); + ensureOpen(); createIndexWriter(); if (docAnalyzer != null) indexWriter.addDocument(doc, docAnalyzer); @@ -307,7 +323,7 @@ */ public int deleteDocuments(Term term) throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException { synchronized(directory) { - assureOpen(); + ensureOpen(); createIndexReader(); return indexReader.deleteDocuments(term); } @@ -326,7 +342,7 @@ */ public void deleteDocument(int docNum) throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException { synchronized(directory) { - assureOpen(); + ensureOpen(); createIndexReader(); indexReader.deleteDocument(docNum); } @@ -341,7 +357,7 @@ */ public int docCount() { synchronized(directory) { - assureOpen(); + ensureOpen(); if (indexWriter != null) { return indexWriter.docCount(); } else { @@ -363,7 +379,7 @@ */ public void optimize() throws CorruptIndexException, LockObtainFailedException, IOException { synchronized(directory) { - assureOpen(); + ensureOpen(); createIndexWriter(); indexWriter.optimize(); } @@ -378,7 +394,7 @@ */ public void setInfoStream(PrintStream infoStream) { synchronized(directory) { - assureOpen(); + ensureOpen(); if (indexWriter != null) { indexWriter.setInfoStream(infoStream); } @@ -396,31 +412,50 @@ */ public PrintStream getInfoStream() throws CorruptIndexException, LockObtainFailedException, IOException { synchronized(directory) { - assureOpen(); + ensureOpen(); createIndexWriter(); return indexWriter.getInfoStream(); } } /** - * Setting to turn on usage of a compound file. When on, multiple files - * for each segment are merged into a single file once the segment creation - * is finished. This is done regardless of what directory is in use. - * @see IndexWriter#setUseCompoundFile(boolean) - * @throws IllegalStateException if the index is closed + * Set the merge policy used by thisIndexModifier
*/
- public void setUseCompoundFile(boolean useCompoundFile) {
- synchronized(directory) {
- assureOpen();
+ public void setMergePolicy(MergePolicy mp) throws CorruptIndexException, IOException {
+ ensureOpen();
+ if (mergePolicy != null && mergePolicy != mp) {
if (indexWriter != null) {
- indexWriter.setUseCompoundFile(useCompoundFile);
+ indexWriter.flush();
+ indexWriter.releaseMergePolicy();
}
- this.useCompoundFile = useCompoundFile;
+ mergePolicy.close();
}
+ mergePolicy = mp;
+ if (indexWriter != null) {
+ indexWriter.setMergePolicy(mergePolicy);
+ }
}
/**
- * @see IndexModifier#setUseCompoundFile(boolean)
+ * @see #setMergePolicy(MergePolicy)
+ */
+ public MergePolicy getMergePolicy() {
+ return mergePolicy;
+ }
+
+
+ /**
+ * @deprecated
+ * @see LogDocMergePolicy#setUseCompoundFile(boolean)
+ * @throws IllegalStateException if the index is closed
+ */
+ public void setUseCompoundFile(boolean value) {
+ ((LogDocMergePolicy) getMergePolicy()).setUseCompoundFile(value);
+ }
+
+ /**
+ * @deprecated
+ * @see LogDocMergePolicy#getUseCompoundFile()
* @throws CorruptIndexException if the index is corrupt
* @throws LockObtainFailedException if another writer
* has this index open (write.lock could not
@@ -428,11 +463,7 @@
* @throws IOException if there is a low-level IO error
*/
public boolean getUseCompoundFile() throws CorruptIndexException, LockObtainFailedException, IOException {
- synchronized(directory) {
- assureOpen();
- createIndexWriter();
- return indexWriter.getUseCompoundFile();
- }
+ return ((LogDocMergePolicy) getMergePolicy()).getUseCompoundFile();
}
/**
@@ -451,7 +482,7 @@
*/
public void setMaxFieldLength(int maxFieldLength) {
synchronized(directory) {
- assureOpen();
+ ensureOpen();
if (indexWriter != null) {
indexWriter.setMaxFieldLength(maxFieldLength);
}
@@ -469,7 +500,7 @@
*/
public int getMaxFieldLength() throws CorruptIndexException, LockObtainFailedException, IOException {
synchronized(directory) {
- assureOpen();
+ ensureOpen();
createIndexWriter();
return indexWriter.getMaxFieldLength();
}
@@ -479,8 +510,7 @@
* Determines the minimal number of documents required before the buffered
* in-memory documents are merging and a new Segment is created.
* Since Documents are merged in a {@link org.apache.lucene.store.RAMDirectory},
- * large value gives faster indexing. At the same time, mergeFactor limits
- * the number of files open in a FSDirectory.
+ * a large value gives faster indexing.
*
* The default value is 10. * @@ -490,7 +520,7 @@ */ public void setMaxBufferedDocs(int maxBufferedDocs) { synchronized(directory) { - assureOpen(); + ensureOpen(); if (indexWriter != null) { indexWriter.setMaxBufferedDocs(maxBufferedDocs); } @@ -508,36 +538,26 @@ */ public int getMaxBufferedDocs() throws CorruptIndexException, LockObtainFailedException, IOException { synchronized(directory) { - assureOpen(); + ensureOpen(); createIndexWriter(); return indexWriter.getMaxBufferedDocs(); } } /** - * Determines how often segment indices are merged by addDocument(). With - * smaller values, less RAM is used while indexing, and searches on - * unoptimized indices are faster, but indexing speed is slower. With larger - * values, more RAM is used during indexing, and while searches on unoptimized - * indices are slower, indexing is faster. Thus larger values (> 10) are best - * for batch index creation, and smaller values (< 10) for indices that are - * interactively maintained. - *
This must never be less than 2. The default value is 10.
- *
- * @see IndexWriter#setMergeFactor(int)
+ * @deprecated
+ * @see LogDocMergePolicy#setMergeFactor(int)
* @throws IllegalStateException if the index is closed
*/
public void setMergeFactor(int mergeFactor) {
synchronized(directory) {
- assureOpen();
- if (indexWriter != null) {
- indexWriter.setMergeFactor(mergeFactor);
- }
- this.mergeFactor = mergeFactor;
+ ensureOpen();
+ ((LogDocMergePolicy) getMergePolicy()).setMergeFactor(mergeFactor);
}
}
/**
+ * @deprecated
* @see IndexModifier#setMergeFactor(int)
* @throws CorruptIndexException if the index is corrupt
* @throws LockObtainFailedException if another writer
@@ -547,9 +567,8 @@
*/
public int getMergeFactor() throws CorruptIndexException, LockObtainFailedException, IOException {
synchronized(directory) {
- assureOpen();
- createIndexWriter();
- return indexWriter.getMergeFactor();
+ ensureOpen();
+ return ((LogDocMergePolicy) getMergePolicy()).getMergeFactor();
}
}
@@ -565,12 +584,17 @@
if (!open)
throw new IllegalStateException("Index is closed already");
if (indexWriter != null) {
+ indexWriter.flush();
+ indexWriter.releaseMergePolicy();
indexWriter.close();
indexWriter = null;
} else {
indexReader.close();
indexReader = null;
}
+ if (mergePolicy != null) {
+ mergePolicy.close();
+ }
open = false;
}
}
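
(Illustrative usage sketch, not part of the patch. With the IndexModifier changes above, merge tuning moves from the deprecated setMergeFactor/setUseCompoundFile setters onto a MergePolicy instance. The directory, analyzer and document below are arbitrary, and since the new policy classes are package-private the sketch would only compile inside org.apache.lucene.index.)

    package org.apache.lucene.index;

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.store.RAMDirectory;

    class MergePolicyUsageSketch {
      public static void main(String[] args) throws Exception {
        IndexModifier modifier =
          new IndexModifier(new RAMDirectory(), new StandardAnalyzer(), true);

        // Configure merging through the policy instead of the deprecated setters.
        LogDocMergePolicy policy = new LogDocMergePolicy();
        policy.setMergeFactor(4);
        policy.setUseCompoundFile(true);
        modifier.setMergePolicy(policy);

        Document doc = new Document();
        doc.add(new Field("id", "1", Field.Store.YES, Field.Index.UN_TOKENIZED));
        modifier.addDocument(doc);
        modifier.close();
      }
    }
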
Index: src/java/org/apache/lucene/index/IndexMerger.java
===================================================================
--- src/java/org/apache/lucene/index/IndexMerger.java (revision 0)
+++ src/java/org/apache/lucene/index/IndexMerger.java (revision 0)
@@ -0,0 +1,80 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.store.Directory;
+
+import java.io.IOException;
+
+/**
+ * The set of operations needed by a MergePolicy object
+ * to trigger individual primitive merge operations. Currently only
+ * implemented by IndexWriter.
+ */
+
+interface IndexMerger {
+
+ /**
+ * The primary primitive merge operation
+ *
+ * @param m specification of the desired merge
+ * @return number of documents in the new segment (or -1 if indeterminate)
+ */
+
+ int merge(MergePolicy.MergeSpecification m)
+ throws CorruptIndexException, IOException;
+
+ /**
+ * Identifies the target directory of a merge so that the merge
+ * policy can determine if segments need to be copied between
+ * directories.
+ *
+ * @return the directory that new segments will be created within.
+ */
+ Directory getDirectory();
+
+ /**
+ * Returns 0 if this writer is flushing by RAM usage, else
+ * returns the number of buffered added documents that will
+ * trigger a flush.
+ * @see #setMaxBufferedDocs
+ */
+ int getMaxBufferedDocs();
+
+ /**
+ * Returns 0.0 if this writer is flushing by document
+ * count, else returns the value set by {@link
+ * #setRAMBufferSizeMB}.
+ */
+ double getRAMBufferSizeMB();
+
+ /**
+ * High-level optimize request; generally will result in one or more
+ * primitive merge requests via the merge policy.
+ */
+ void optimize()
+ throws CorruptIndexException, IOException;
+
+ /**
+ * High-level merge request; generally will result in one or more
+ * primitive merge requests via the merge policy.
+ */
+ void merge()
+ throws CorruptIndexException, IOException;
+
+}
Property changes on: src/java/org/apache/lucene/index/IndexMerger.java
___________________________________________________________________
Name: svn:eol-style
+ native
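To make the IndexMerger contract concrete, here is a short illustrative fragment, not part of the patch, of the kind of call a merge policy might make against the merger. It assumes the MergePolicy.MergeSpecification class introduced in the next file; the helper name and the "merge the rightmost N segments" rule are invented for the example, and the class lives in the index package because the new types are package-private.

package org.apache.lucene.index;

import java.io.IOException;

// Hypothetical helper, shown only to illustrate the IndexMerger contract.
class TailMergeHelper {
  void mergeTail(SegmentInfos infos, IndexMerger merger, int numToMerge)
      throws CorruptIndexException, IOException {
    if (infos.size() < numToMerge)
      return;                                    // nothing worth merging yet
    MergePolicy.MergeSpecification spec = new MergePolicy.MergeSpecification();
    spec.segments = new SegmentInfos();
    for (int i = infos.size() - numToMerge; i < infos.size(); i++)
      spec.segments.addElement(infos.info(i));   // pick the rightmost segments
    spec.useCompoundFile = true;                 // format for the new segment
    merger.merge(spec);                          // one primitive merge on the writer
  }
}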
Index: src/java/org/apache/lucene/index/MergePolicy.java
===================================================================
--- src/java/org/apache/lucene/index/MergePolicy.java (revision 0)
+++ src/java/org/apache/lucene/index/MergePolicy.java (revision 0)
@@ -0,0 +1,104 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+/**
+ * A merge policy determines the sequence of primitive merge
+ * operations to be used for overall merge and optimize
+ * operations. Each merge or optimize call on the merge policy results
+ * in zero or more primitive merge calls on an
+ * IndexMerger. The merge policy is called by
+ * IndexWriter with the current list of all segments and
+ * it, in turn, calls back into an IndexMerger (typically the
+ * writer, again), with individual merge requests, each of which
+ * generates a single new segment from the contents of a subset of all
+ * segments.
+ */
+
+/* Notes: a merge policy piggybacks on the synchronization of the
+ * caller (presumably the merger, e.g., IndexWriter). Non-concurrent
+ * mergers do little internal synchronization.
+ */
+
+interface MergePolicy {
+
+ /**
+ * A merge specification provides the information necessary to
+ * perform an individual primitive merge operation, resulting in a
+ * single new segment. The merge spec includes the subset of
+ * segments to be merged as well as various options such as whether
+ * the new segment should or should not be in compound format.
+ */
+
+ public static class MergeSpecification implements Cloneable {
+ /**
+ * The subset of segments to be included in the primitive merge.
+ */
+ public SegmentInfos segments;
+
+ /**
+ * Indicates whether the resulting segment should use the compound file format.
+ */
+ public boolean useCompoundFile;
+
+ /**
+ * Publicly available clone().
+ */
+ public MergeSpecification copy() {
+ try {
+ return (MergeSpecification)super.clone();
+ } catch (CloneNotSupportedException cnse) {
+ throw new RuntimeException ("clone not supported on a MergeSpecification subclass");
+ }
+ }
+ }
+
+ /**
+ * Merge a sequence of segments. The policy determines what set of
+ * primitive merge operations constitutes a high-level merge.
+ *
+ * @param segmentInfos the total set of segments in the index
+ */
+ void merge(SegmentInfos segmentInfos,
+ IndexMerger merger)
+ throws CorruptIndexException, IOException;
+
+ /**
+ * Optimize a sequence of segments. The policy determines what set of
+ * primitive merge operations constitutes an optimize.
+ *
+ * @param segmentInfos the total set of segments in the index
+ */
+ void optimize(SegmentInfos segmentInfos,
+ IndexMerger merger)
+ throws CorruptIndexException, IOException;
+
+ /**
+ * Release all resources for the policy.
+ */
+ void close();
+
+ /**
+ * Returns whether a newly flushed segment (one not produced by a merge)
+ * should use the compound file format.
+ */
+ boolean useCompoundFile(SegmentInfos segments, SegmentInfo newSegment);
+
+}
Property changes on: src/java/org/apache/lucene/index/MergePolicy.java
___________________________________________________________________
Name: svn:eol-style
+ native
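As a rough sketch of what an implementation of this interface can look like (again, not part of the patch; LogDocMergePolicy is the default the patch installs), the following hypothetical policy never merges on ordinary flushes and collapses the whole index into a single compound-file segment on optimize:

package org.apache.lucene.index;

import java.io.IOException;

// Illustrative only: a deliberately simple MergePolicy.
class CollapseAllMergePolicy implements MergePolicy {

  public void merge(SegmentInfos segmentInfos, IndexMerger merger)
      throws CorruptIndexException, IOException {
    // Never merge on ordinary flushes; segments simply accumulate.
  }

  public void optimize(SegmentInfos segmentInfos, IndexMerger merger)
      throws CorruptIndexException, IOException {
    if (segmentInfos.size() < 2)
      return;                                   // already a single segment
    MergeSpecification spec = new MergeSpecification();
    spec.segments = (SegmentInfos) segmentInfos.clone();
    spec.useCompoundFile = true;
    merger.merge(spec);                         // one primitive merge over everything
  }

  public boolean useCompoundFile(SegmentInfos segments, SegmentInfo newSegment) {
    return true;                                // newly flushed segments are compound
  }

  public void close() {
    // No resources to release.
  }
}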
Index: src/java/org/apache/lucene/index/IndexWriter.java
===================================================================
--- src/java/org/apache/lucene/index/IndexWriter.java (revision 563241)
+++ src/java/org/apache/lucene/index/IndexWriter.java (working copy)
@@ -161,7 +161,7 @@
* referenced by the "front" of the index). For this, IndexFileDeleter
* keeps track of the last non commit checkpoint.
*/
-public class IndexWriter {
+public class IndexWriter implements IndexMerger {
/**
* Default value for the write lock timeout (1,000).
@@ -177,9 +177,10 @@
public static final String WRITE_LOCK_NAME = "write.lock";
/**
- * Default value is 10. Change using {@link #setMergeFactor(int)}.
+ * @deprecated
+ * @see LogDocMergePolicy#DEFAULT_MERGE_FACTOR
*/
- public final static int DEFAULT_MERGE_FACTOR = 10;
+ public final static int DEFAULT_MERGE_FACTOR = LogDocMergePolicy.DEFAULT_MERGE_FACTOR;
/**
* Default value is 10. Change using {@link #setMaxBufferedDocs(int)}.
@@ -205,7 +206,8 @@
public final static int DEFAULT_MAX_BUFFERED_DELETE_TERMS = 1000;
/**
- * Default value is {@link Integer#MAX_VALUE}. Change using {@link #setMaxMergeDocs(int)}.
+ * @deprecated
+ * @see LogDocMergePolicy#DEFAULT_MAX_MERGE_DOCS
*/
public final static int DEFAULT_MAX_MERGE_DOCS = Integer.MAX_VALUE;
@@ -255,14 +257,8 @@
// The key is delete term; the value is number of ram
// segments the term applies to.
private HashMap bufferedDeleteTerms = new HashMap();
- private int numBufferedDeleteTerms = 0;
+ /* private */ int numBufferedDeleteTerms = 0;
- /** Use compound file setting. Defaults to true, minimizing the number of
- * files used. Setting this to false may improve indexing performance, but
- * may also cause file handle problems.
- */
- private boolean useCompoundFile = true;
-
private boolean closeDir;
private boolean closed;
@@ -278,23 +274,22 @@
}
}
- /** Get the current setting of whether to use the compound file format.
- * Note that this just returns the value you set with setUseCompoundFile(boolean)
- * or the default. You cannot use this to query the status of an existing index.
- * @see #setUseCompoundFile(boolean)
+ /**
+ * @deprecated
+ * @see LogDocMergePolicy#getUseCompoundFile()
+ *
*/
public boolean getUseCompoundFile() {
- ensureOpen();
- return useCompoundFile;
+ return ((LogDocMergePolicy) getMergePolicy()).getUseCompoundFile();
}
- /** Setting to turn on usage of a compound file. When on, multiple files
- * for each segment are merged into a single file once the segment creation
- * is finished. This is done regardless of what directory is in use.
+ /**
+ * @deprecated
+ * @see LogDocMergePolicy#setUseCompoundFile(boolean)
+ *
*/
public void setUseCompoundFile(boolean value) {
- ensureOpen();
- useCompoundFile = value;
+ ((LogDocMergePolicy) getMergePolicy()).setUseCompoundFile(value);
}
/** Expert: Set the Similarity implementation used by this IndexWriter.
@@ -652,16 +647,17 @@
}
}
- /** Determines the largest number of documents ever merged by addDocument().
- * Small values (e.g., less than 10,000) are best for interactive indexing,
- * as this limits the length of pauses while indexing to a few seconds.
- * Larger values are best for batched indexing and speedier searches.
- *
- *
- * The default value is {@link Integer#MAX_VALUE}.
+ private MergePolicy mergePolicy = new LogDocMergePolicy();
+
+ /**
+ * Set the merge policy used by this IndexWriter
*/
- public void setMaxMergeDocs(int maxMergeDocs) {
+ public void setMergePolicy(MergePolicy mp) {
ensureOpen();
- this.maxMergeDocs = maxMergeDocs;
+ if ( mergePolicy != null && mergePolicy != mp ) {
+ mergePolicy.close();
+ }
+ mergePolicy = mp;
}
/**
@@ -669,12 +665,40 @@
* single segment.
* @see #setMaxMergeDocs
*/
- public int getMaxMergeDocs() {
+ public MergePolicy getMergePolicy() {
ensureOpen();
- return maxMergeDocs;
+ return mergePolicy;
}
/**
+ * Releases the merge policy held by this writer without closing it.
+ *
+ * Currently used by
+ * IndexModifier since it "owns" the merge policy and thus the
+ * policy shouldn't be closed when the writer is closed.
+ */
+ public void releaseMergePolicy() {
+ ensureOpen();
+ mergePolicy = null;
+ }
+
+ /**
+ * @deprecated
+ * @see LogDocMergePolicy#setMaxMergeDocs(int)
+ */
+ public void setMaxMergeDocs(int maxMergeDocs) {
+ ((LogDocMergePolicy) getMergePolicy()).setMaxMergeDocs(maxMergeDocs);
+ }
+
+ /**
+ * @deprecated
+ * @see LogDocMergePolicy#getMaxMergeDocs()
+ */
+ public int getMaxMergeDocs() {
+ return ((LogDocMergePolicy) getMergePolicy()).getMaxMergeDocs();
+ }
+
+ /**
* The maximum number of terms that will be indexed for a single field in a
* document. This limits the amount of memory required for indexing, so that
* collections with very large files will not crash the indexing process by
@@ -717,6 +741,7 @@
* @throws IllegalArgumentException if maxBufferedDocs is
* smaller than 2
* @see #setRAMBufferSizeMB
+ *
*/
public void setMaxBufferedDocs(int maxBufferedDocs) {
ensureOpen();
@@ -788,32 +813,20 @@
return maxBufferedDeleteTerms;
}
- /** Determines how often segment indices are merged by addDocument(). With
- * smaller values, less RAM is used while indexing, and searches on
- * unoptimized indices are faster, but indexing speed is slower. With larger
- * values, more RAM is used during indexing, and while searches on unoptimized
- * indices are slower, indexing is faster. Thus larger values (> 10) are best
- * for batch index creation, and smaller values (< 10) for indices that are
- * interactively maintained.
- *
- *
This must never be less than 2. The default value is 10. + /** + * @deprecated + * @see LogDocMergePolicy#setMergeFactor(int) */ public void setMergeFactor(int mergeFactor) { - ensureOpen(); - if (mergeFactor < 2) - throw new IllegalArgumentException("mergeFactor cannot be less than 2"); - this.mergeFactor = mergeFactor; + ((LogDocMergePolicy) getMergePolicy()).setMergeFactor(mergeFactor); } /** - * Returns the number of segments that are merged at once - * and also controls the total number of segments allowed - * to accumulate in the index. - * @see #setMergeFactor + * @deprecated + * @see LogDocMergePolicy#getMergeFactor */ public int getMergeFactor() { - ensureOpen(); - return mergeFactor; + return ((LogDocMergePolicy) getMergePolicy()).getMergeFactor(); } /** If non-null, this will be the default infoStream used @@ -839,7 +852,6 @@ */ public void setInfoStream(PrintStream infoStream) { ensureOpen(); - this.infoStream = infoStream; docWriter.setInfoStream(infoStream); deleter.setInfoStream(infoStream); } @@ -935,6 +947,11 @@ rollbackSegmentInfos = null; } + if (mergePolicy != null) { + mergePolicy.close(); + mergePolicy = null; + } + if (writeLock != null) { writeLock.release(); // release write lock writeLock = null; @@ -965,6 +982,10 @@ docWriter.abort(); } + /* FIXME(?) get doc store segment? */ + boolean useCompoundFile = mergePolicy.useCompoundFile(segmentInfos, + null); + if (useCompoundFile && docStoreSegment != null) { // Now build compound doc store file checkpoint(); @@ -1073,23 +1094,19 @@ *
This method periodically flushes pending documents
* to the Directory (every {@link #setMaxBufferedDocs}),
* and also periodically merges segments in the index
- * (every {@link #setMergeFactor} flushes). When this
+ * by calling {@link MergePolicy#setMergeFactor}. When this
* occurs, the method will take more time to run (possibly
* a long time if the index is large), and will require
* free temporary space in the Directory to do the
* merging.
*
- * The amount of free space required when a merge is
- * triggered is up to 1X the size of all segments being
- * merged, when no readers/searchers are open against the
- * index, and up to 2X the size of all segments being
- * merged when readers/searchers are open against the
- * index (see {@link #optimize()} for details). Most
- * merges are small (merging the smallest segments
- * together), but whenever a full merge occurs (all
- * segments in the index, which is the worst case for
- * temporary space usage) then the maximum free disk space
- * required is the same as {@link #optimize}.
+ *The amount of free space required when a merge is triggered is + * up to 1X the size of all segments being merged, when no + * readers/searchers are open against the index, and up to 2X the + * size of all segments being merged when readers/searchers are open + * against the index (see {@link #optimize()} for details). The + * sequence of primitive merge operations performed is governed by + * the merge policy. * * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error @@ -1117,6 +1134,8 @@ try { success = docWriter.addDocument(doc, analyzer); } catch (IOException ioe) { + bufferedDeleteTerms.clear(); + numBufferedDeleteTerms = 0; deleter.refresh(); throw ioe; } @@ -1192,6 +1211,8 @@ try { success = docWriter.addDocument(doc, analyzer); } catch (IOException ioe) { + bufferedDeleteTerms.clear(); + numBufferedDeleteTerms = 0; deleter.refresh(); throw ioe; } @@ -1224,51 +1245,33 @@ return "_" + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX); } - /** Determines how often segment indices are merged by addDocument(). With - * smaller values, less RAM is used while indexing, and searches on - * unoptimized indices are faster, but indexing speed is slower. With larger - * values, more RAM is used during indexing, and while searches on unoptimized - * indices are slower, indexing is faster. Thus larger values (> 10) are best - * for batch index creation, and smaller values (< 10) for indices that are - * interactively maintained. - * - *
This must never be less than 2. The default value is {@link #DEFAULT_MERGE_FACTOR}. - - */ - private int mergeFactor = DEFAULT_MERGE_FACTOR; - /** Determines amount of RAM usage by the buffered docs at * which point we trigger a flush to the index. */ private double ramBufferSize = DEFAULT_RAM_BUFFER_SIZE_MB*1024F*1024F; - /** Determines the largest number of documents ever merged by addDocument(). - * Small values (e.g., less than 10,000) are best for interactive indexing, - * as this limits the length of pauses while indexing to a few seconds. - * Larger values are best for batched indexing and speedier searches. - * - *
The default value is {@link #DEFAULT_MAX_MERGE_DOCS}. - - */ - private int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS; - /** If non-null, information about merges will be printed to this. */ private PrintStream infoStream = null; - private static PrintStream defaultInfoStream = null; - /** Merges all segments together into a single segment, - * optimizing an index for search. + /** + * Requests an "optimize" operation on an index, priming the index + * for the fastest available search. Traditionally this has meant + * merging all segments into a single segment as is done in the + * default merge policy, but individaul merge policies may implement + * optimize in different ways. * + * @see LogDocMergePolicy#optimize(SegmentInfos) + * *
It is recommended that this method be called upon completion of indexing. In
* environments with frequent updates, optimize is best done during low volume times, if at all.
*
*
* See http://www.gossamer-threads.com/lists/lucene/java-dev/47895 for more discussion.
*
- * Note that this requires substantial temporary free
+ * Note that this can require substantial temporary free
* space in the Directory (see LUCENE-764
* for details):
@@ -1306,7 +1309,7 @@
* The actual temporary usage could be much less than
* these figures (it depends on many factors).
*
- * Once the optimize completes, the total size of the
+ * In general, once the optimize completes, the total size of the
* index will be less than the size of the starting index.
* It could be quite a bit smaller (if there were many
* pending deletes) or just slightly smaller.
@@ -1326,18 +1329,21 @@ public synchronized void optimize() throws CorruptIndexException, IOException { ensureOpen(); flush(); - while (segmentInfos.size() > 1 || - (segmentInfos.size() == 1 && - (SegmentReader.hasDeletions(segmentInfos.info(0)) || - SegmentReader.hasSeparateNorms(segmentInfos.info(0)) || - segmentInfos.info(0).dir != directory || - (useCompoundFile && - !segmentInfos.info(0).getUseCompoundFile())))) { - int minSegment = segmentInfos.size() - mergeFactor; - mergeSegments(minSegment < 0 ? 0 : minSegment, segmentInfos.size()); - } + mergePolicy.optimize(segmentInfos, this); } + /** + * Requests a merge operation on an index. In memory data is first + * flushed to disk and then the merge policy called. + * + * Explict calls to merge() are usually not necessary. The most + * common case when they would be used are when merge policy + * parameters change or when concurrent merges are used. + */ + public synchronized void merge() throws CorruptIndexException, IOException { + flush(); + } + /* * Begin a transaction. During a transaction, any segment * merges that happen (or ram segments flushed) will not @@ -1351,7 +1357,7 @@ * within the transactions, so they must be flushed before the * transaction is started. */ - private void startTransaction() throws IOException { + void startTransaction() throws IOException { assert numBufferedDeleteTerms == 0 : "calling startTransaction with buffered delete terms not supported"; @@ -1374,7 +1380,7 @@ * Rolls back the transaction and restores state to where * we were at the start. */ - private void rollbackTransaction() throws IOException { + void rollbackTransaction() throws IOException { // First restore autoCommit in case we hit an exception below: autoCommit = localAutoCommit; @@ -1403,7 +1409,7 @@ * segments file and remove and pending deletions we have * accumulated during the transaction */ - private void commitTransaction() throws IOException { + void commitTransaction() throws IOException { // First restore autoCommit in case we hit an exception below: autoCommit = localAutoCommit; @@ -1451,6 +1457,10 @@ segmentInfos.clear(); segmentInfos.addAll(rollbackSegmentInfos); + // discard any buffered delete terms so they aren't applied in close() + bufferedDeleteTerms.clear(); + numBufferedDeleteTerms = 0; + docWriter.abort(); // Ask deleter to locate unreferenced files & remove @@ -1458,9 +1468,6 @@ deleter.checkpoint(segmentInfos, false); deleter.refresh(); - bufferedDeleteTerms.clear(); - numBufferedDeleteTerms = 0; - commitPending = false; docWriter.abort(); close(); @@ -1537,7 +1544,7 @@ throws CorruptIndexException, IOException { ensureOpen(); - optimize(); // start with zero or 1 seg + flush(); int start = segmentInfos.size(); @@ -1554,15 +1561,8 @@ } } - // merge newly added segments in log(n) passes - while (segmentInfos.size() > start+mergeFactor) { - for (int base = start; base < segmentInfos.size(); base++) { - int end = Math.min(segmentInfos.size(), base+mergeFactor); - if (end-base > 1) { - mergeSegments(base, end); - } - } - } + mergePolicy.optimize(segmentInfos, this); + success = true; } finally { if (success) { @@ -1571,8 +1571,7 @@ rollbackTransaction(); } } - - optimize(); // final cleanup + } /** @@ -1594,40 +1593,10 @@ */ public synchronized void addIndexesNoOptimize(Directory[] dirs) throws CorruptIndexException, IOException { - // Adding indexes can be viewed as adding a sequence of segments S to - // a sequence of segments T. 
Segments in T follow the invariants but - // segments in S may not since they could come from multiple indexes. - // Here is the merge algorithm for addIndexesNoOptimize(): - // - // 1 Flush ram. - // 2 Consider a combined sequence with segments from T followed - // by segments from S (same as current addIndexes(Directory[])). - // 3 Assume the highest level for segments in S is h. Call - // maybeMergeSegments(), but instead of starting w/ lowerBound = -1 - // and upperBound = maxBufferedDocs, start w/ lowerBound = -1 and - // upperBound = upperBound of level h. After this, the invariants - // are guaranteed except for the last < M segments whose levels <= h. - // 4 If the invariants hold for the last < M segments whose levels <= h, - // if some of those < M segments are from S (not merged in step 3), - // properly copy them over*, otherwise done. - // Otherwise, simply merge those segments. If the merge results in - // a segment of level <= h, done. Otherwise, it's of level h+1 and call - // maybeMergeSegments() starting w/ upperBound = upperBound of level h+1. - // - // * Ideally, we want to simply copy a segment. However, directory does - // not support copy yet. In addition, source may use compound file or not - // and target may use compound file or not. So we use mergeSegments() to - // copy a segment, which may cause doc count to change because deleted - // docs are garbage collected. - // 1 flush ram - ensureOpen(); flush(); - // 2 copy segment infos and find the highest level from dirs - int startUpperBound = docWriter.getMaxBufferedDocs(); - /* new merge policy if (startUpperBound == 0) startUpperBound = 10; @@ -1650,64 +1619,13 @@ for (int j = 0; j < sis.size(); j++) { SegmentInfo info = sis.info(j); segmentInfos.addElement(info); // add each info - - while (startUpperBound < info.docCount) { - startUpperBound *= mergeFactor; // find the highest level from dirs - if (startUpperBound > maxMergeDocs) { - // upper bound cannot exceed maxMergeDocs - throw new IllegalArgumentException("Upper bound cannot exceed maxMergeDocs"); - } - } } } - // 3 maybe merge segments starting from the highest level from dirs - maybeMergeSegments(startUpperBound); + mergePolicy.merge(segmentInfos, this); - // get the tail segments whose levels <= h - int segmentCount = segmentInfos.size(); - int numTailSegments = 0; - while (numTailSegments < segmentCount - && startUpperBound >= segmentInfos.info(segmentCount - 1 - numTailSegments).docCount) { - numTailSegments++; - } - if (numTailSegments == 0) { - success = true; - return; - } - - // 4 make sure invariants hold for the tail segments whose levels <= h - if (checkNonDecreasingLevels(segmentCount - numTailSegments)) { - // identify the segments from S to be copied (not merged in 3) - int numSegmentsToCopy = 0; - while (numSegmentsToCopy < segmentCount - && directory != segmentInfos.info(segmentCount - 1 - numSegmentsToCopy).dir) { - numSegmentsToCopy++; - } - if (numSegmentsToCopy == 0) { - success = true; - return; - } - - // copy those segments from S - for (int i = segmentCount - numSegmentsToCopy; i < segmentCount; i++) { - mergeSegments(i, i + 1); - } - if (checkNonDecreasingLevels(segmentCount - numSegmentsToCopy)) { - success = true; - return; - } - } - - // invariants do not hold, simply merge those segments - mergeSegments(segmentCount - numTailSegments, segmentCount); - - // maybe merge segments again if necessary - if (segmentInfos.info(segmentInfos.size() - 1).docCount > startUpperBound) { - maybeMergeSegments(startUpperBound * mergeFactor); 
- } - success = true; + } finally { if (success) { commitTransaction(); @@ -1717,14 +1635,9 @@ } } - /** Merges the provided indexes into this index. - *After this completes, the index is optimized.
+ /**
+ * See {@link #addIndexes(Directory[])}
* The provided IndexReaders are not closed.
-
- * See {@link #addIndexes(Directory[])} for
- * details on transactional semantics, temporary free
- * space required in the Directory, and non-CFS segments
- * on an Exception.
* @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ @@ -1732,108 +1645,16 @@ throws CorruptIndexException, IOException { ensureOpen(); - optimize(); // start with zero or 1 seg - final String mergedName = newSegmentName(); - SegmentMerger merger = new SegmentMerger(this, mergedName); - - SegmentInfo info; - - IndexReader sReader = null; - try { - if (segmentInfos.size() == 1){ // add existing index, if any - sReader = SegmentReader.get(segmentInfos.info(0)); - merger.add(sReader); - } - - for (int i = 0; i < readers.length; i++) // add new indexes - merger.add(readers[i]); - - boolean success = false; - - startTransaction(); - - try { - int docCount = merger.merge(); // merge 'em - - if(sReader != null) { - sReader.close(); - sReader = null; - } - - segmentInfos.setSize(0); // pop old infos & add new - info = new SegmentInfo(mergedName, docCount, directory, false, true, - -1, null, false); - segmentInfos.addElement(info); - - success = true; - - } finally { - if (!success) { - rollbackTransaction(); - } else { - commitTransaction(); - } - } - } finally { - if (sReader != null) { - sReader.close(); - } + Directory[] dirs = new Directory[ readers.length ]; + for( int i = 0; i < dirs.length; i++ ) { + dirs[i] = readers[i].directory(); } - - if (useCompoundFile) { - boolean success = false; + addIndexes( dirs ); - startTransaction(); - - try { - merger.createCompoundFile(mergedName + ".cfs"); - info.setUseCompoundFile(true); - } finally { - if (!success) { - rollbackTransaction(); - } else { - commitTransaction(); - } - } - } } - // Overview of merge policy: - // - // A flush is triggered either by close() or by the number of ram segments - // reaching maxBufferedDocs. After a disk segment is created by the flush, - // further merges may be triggered. - // - // LowerBound and upperBound set the limits on the doc count of a segment - // which may be merged. Initially, lowerBound is set to 0 and upperBound - // to maxBufferedDocs. Starting from the rightmost* segment whose doc count - // > lowerBound and <= upperBound, count the number of consecutive segments - // whose doc count <= upperBound. - // - // Case 1: number of worthy segments < mergeFactor, no merge, done. - // Case 2: number of worthy segments == mergeFactor, merge these segments. - // If the doc count of the merged segment <= upperBound, done. - // Otherwise, set lowerBound to upperBound, and multiply upperBound - // by mergeFactor, go through the process again. - // Case 3: number of worthy segments > mergeFactor (in the case mergeFactor - // M changes), merge the leftmost* M segments. If the doc count of - // the merged segment <= upperBound, consider the merged segment for - // further merges on this same level. Merge the now leftmost* M - // segments, and so on, until number of worthy segments < mergeFactor. - // If the doc count of all the merged segments <= upperBound, done. - // Otherwise, set lowerBound to upperBound, and multiply upperBound - // by mergeFactor, go through the process again. - // Note that case 2 can be considerd as a special case of case 3. - // - // This merge policy guarantees two invariants if M does not change and - // segment doc count is not reaching maxMergeDocs: - // B for maxBufferedDocs, f(n) defined as ceil(log_M(ceil(n/B))) - // 1: If i (left*) and i+1 (right*) are two consecutive segments of doc - // counts x and y, then f(x) >= f(y). - // 2: The number of committed segments on the same level (f(n)) <= M. 
- // This is called after pending added and deleted // documents have been flushed to the Directory but before // the change is committed (new segments_N file written). @@ -1906,11 +1727,12 @@ // apply to more than just the last flushed segment boolean flushDeletes = bufferedDeleteTerms.size() > 0; - if (infoStream != null) + if (true && infoStream != null) infoStream.println(" flush: flushDocs=" + flushDocs + " flushDeletes=" + flushDeletes + " flushDocStores=" + flushDocStores + - " numDocs=" + numDocs); + " numDocs=" + numDocs + + " numBufDelTerms=" + numBufferedDeleteTerms); int docStoreOffset = docWriter.getDocStoreOffset(); boolean docStoreIsCompoundFile = false; @@ -1925,7 +1747,8 @@ flushDocStores(); flushDocStores = false; - docStoreIsCompoundFile = useCompoundFile; + /* FIXME(?) get doc store segment(?) */ + docStoreIsCompoundFile = mergePolicy.useCompoundFile(segmentInfos,null); } String segment = docWriter.getSegment(); @@ -1988,6 +1811,8 @@ segmentInfos.clear(); segmentInfos.addAll(rollback); + // System.err.println( "restore buf deletes " + saveNumBufferedDeleteTerms + " " + + // numBufferedDeleteTerms ); if (saveBufferedDeleteTerms != null) { numBufferedDeleteTerms = saveNumBufferedDeleteTerms; bufferedDeleteTerms = saveBufferedDeleteTerms; @@ -2009,7 +1834,8 @@ deleter.checkpoint(segmentInfos, autoCommit); - if (flushDocs && useCompoundFile) { + if (flushDocs && mergePolicy.useCompoundFile(segmentInfos, + newSegment)) { success = false; try { docWriter.createCompoundFile(segment); @@ -2026,14 +1852,15 @@ deleter.checkpoint(segmentInfos, autoCommit); } - /* new merge policy - if (0 == docWriter.getMaxBufferedDocs()) - maybeMergeSegments(mergeFactor * numDocs / 2); - else - maybeMergeSegments(docWriter.getMaxBufferedDocs()); - */ - if (triggerMerge) - maybeMergeSegments(docWriter.getMaxBufferedDocs()); + if (triggerMerge) { + /* new merge policy + if (0 == docWriter.getMaxBufferedDocs()) + mergePolicy.merge(segmentInfos,mergeFactor * numDocs / 2); + else + mergePolicy.merge(segmentInfos,docWriter.getMaxBufferedDocs()); + */ + mergePolicy.merge(segmentInfos, this); + } } } finally { docWriter.clearFlushPending(); @@ -2057,79 +1884,46 @@ return docWriter.getNumDocsInRAM(); } - /** Incremental segment merger. 
*/ - private final void maybeMergeSegments(int startUpperBound) throws CorruptIndexException, IOException { - long lowerBound = -1; - long upperBound = startUpperBound; + /* FIXME if we want to support non-contiguous segment merges */ + synchronized void replace(MergePolicy.MergeSpecification spec, SegmentInfo info) { - /* new merge policy - if (upperBound == 0) upperBound = 10; - */ + int first = segmentInfos.indexOf(spec.segments.info(0)); + int last = segmentInfos.indexOf(spec.segments.info(spec.segments.size() - 1)); - while (upperBound < maxMergeDocs) { - int minSegment = segmentInfos.size(); - int maxSegment = -1; + last++; - // find merge-worthy segments - while (--minSegment >= 0) { - SegmentInfo si = segmentInfos.info(minSegment); + if (!(first >= 0) || !(last >= 1) || !(last - first == spec.segments.size())) { + throw new RuntimeException("bad replace spec"); + } - if (maxSegment == -1 && si.docCount > lowerBound && si.docCount <= upperBound) { - // start from the rightmost* segment whose doc count is in bounds - maxSegment = minSegment; - } else if (si.docCount > upperBound) { - // until the segment whose doc count exceeds upperBound - break; - } - } + // System.err.println( "before ++" ); + for( int i=0; i < segmentInfos.size(); i++ ) { + // System.err.println( segmentInfos.info(i).name ); + } - minSegment++; - maxSegment++; - int numSegments = maxSegment - minSegment; + segmentInfos.subList(first, last).clear(); + segmentInfos.add(first, info); - if (numSegments < mergeFactor) { - break; - } else { - boolean exceedsUpperLimit = false; - - // number of merge-worthy segments may exceed mergeFactor when - // mergeFactor and/or maxBufferedDocs change(s) - while (numSegments >= mergeFactor) { - // merge the leftmost* mergeFactor segments - - int docCount = mergeSegments(minSegment, minSegment + mergeFactor); - numSegments -= mergeFactor; - - if (docCount > upperBound) { - // continue to merge the rest of the worthy segments on this level - minSegment++; - exceedsUpperLimit = true; - } else { - // if the merged segment does not exceed upperBound, consider - // this segment for further merges on this same level - numSegments++; - } - } - - if (!exceedsUpperLimit) { - // if none of the merged segments exceed upperBound, done - break; - } - } - - lowerBound = upperBound; - upperBound *= mergeFactor; + // System.err.println( "after ++" ); + for( int i=0; i < segmentInfos.size(); i++ ) { + // System.err.println( segmentInfos.info(i).name ); } + } /** - * Merges the named range of segments, replacing them in the stack with a + * Merges the indicated segments, replacing them in the stack with a * single segment. 
*/ - private final int mergeSegments(int minSegment, int end) + public final int merge(MergePolicy.MergeSpecification spec) throws CorruptIndexException, IOException { + SegmentInfos sourceSegments = spec.segments; + + int minSegment = 0; + int end = sourceSegments.size(); + final String mergedName = newSegmentName(); SegmentMerger merger = null; @@ -2156,7 +1950,7 @@ // Test each segment to be merged for (int i = minSegment; i < end; i++) { - SegmentInfo si = segmentInfos.info(i); + SegmentInfo si = sourceSegments.info(i); // If it has deletions we must merge the doc stores if (si.hasDeletions()) @@ -2207,7 +2001,7 @@ docStoreSegment = null; docStoreIsCompoundFile = false; } else { - SegmentInfo si = segmentInfos.info(minSegment); + SegmentInfo si = sourceSegments.info(minSegment); docStoreOffset = si.getDocStoreOffset(); docStoreSegment = si.getDocStoreSegment(); docStoreIsCompoundFile = si.getDocStoreIsCompoundFile(); @@ -2223,7 +2017,7 @@ merger = new SegmentMerger(this, mergedName); for (int i = minSegment; i < end; i++) { - SegmentInfo si = segmentInfos.info(i); + SegmentInfo si = sourceSegments.info(i); if (infoStream != null) infoStream.print(" " + si.name + " (" + si.docCount + " docs)"); IndexReader reader = SegmentReader.get(si, MERGE_READ_BUFFER_SIZE, mergeDocStores); // no need to set deleter (yet) @@ -2249,13 +2043,10 @@ docStoreSegment, docStoreIsCompoundFile); - rollback = (SegmentInfos) segmentInfos.clone(); + rollback = (SegmentInfos) sourceSegments.clone(); - for (int i = end-1; i > minSegment; i--) // remove old infos & add new - segmentInfos.remove(i); + replace(spec, newSegment); - segmentInfos.set(minSegment, newSegment); - checkpoint(); success = true; @@ -2267,8 +2058,8 @@ // instances, but keep original SegmentInfos // instance (so we don't try to write again the // same segments_N file -- write once): - segmentInfos.clear(); - segmentInfos.addAll(rollback); + sourceSegments.clear(); + sourceSegments.addAll(rollback); } // Delete any partially created and now unreferenced files: @@ -2285,6 +2076,8 @@ // Give deleter a chance to remove files now. deleter.checkpoint(segmentInfos, autoCommit); + boolean useCompoundFile = spec.useCompoundFile; + if (useCompoundFile) { boolean success = false; @@ -2311,6 +2104,7 @@ return mergedDocCount; } + // Called during flush to apply any buffered deletes. If // flushedNewSegment is true then a new segment was just // created and flushed from the ram segments, so we will @@ -2333,6 +2127,7 @@ // Apply delete terms to the segment just flushed from ram // apply appropriately so that a delete term is only applied to // the documents buffered before it, not those buffered after it. + // System.err.println("apply deletes to new"); applyDeletesSelectively(bufferedDeleteTerms, reader); } finally { if (reader != null) { @@ -2357,6 +2152,7 @@ // Apply delete terms to disk segments // except the one just flushed from ram. + // System.err.println("apply deletes to " + i); applyDeletes(bufferedDeleteTerms, reader); } finally { if (reader != null) { @@ -2370,7 +2166,6 @@ } // Clean up bufferedDeleteTerms. - // Rollbacks of buffered deletes are based on restoring the old // map, so don't modify this one. 
Rare enough that the gc // overhead is almost certainly lower than the alternate, which @@ -2381,29 +2176,6 @@ } } - private final boolean checkNonDecreasingLevels(int start) { - int lowerBound = -1; - int upperBound = docWriter.getMaxBufferedDocs(); - - /* new merge policy - if (upperBound == 0) - upperBound = 10; - */ - - for (int i = segmentInfos.size() - 1; i >= start; i--) { - int docCount = segmentInfos.info(i).docCount; - if (docCount <= lowerBound) { - return false; - } - - while (docCount > upperBound) { - lowerBound = upperBound; - upperBound *= mergeFactor; - } - } - return true; - } - // For test purposes. final synchronized int getBufferedDeleteTermsSize() { return bufferedDeleteTerms.size(); @@ -2438,6 +2210,7 @@ private void bufferDeleteTerm(Term term) { Num num = (Num) bufferedDeleteTerms.get(term); int numDoc = docWriter.getNumDocsInRAM(); + // System.err.println( "buf delete " + term ); if (num == null) { bufferedDeleteTerms.put(term, new Num(numDoc)); } else { @@ -2480,6 +2253,7 @@ Iterator iter = deleteTerms.entrySet().iterator(); while (iter.hasNext()) { Entry entry = (Entry) iter.next(); + // System.err.println("apply " + entry.getKey()); reader.deleteDocuments((Term) entry.getKey()); } } Index: src/java/org/apache/lucene/index/IndexFileDeleter.java =================================================================== --- src/java/org/apache/lucene/index/IndexFileDeleter.java (revision 563241) +++ src/java/org/apache/lucene/index/IndexFileDeleter.java (working copy) @@ -122,6 +122,7 @@ throws CorruptIndexException, IOException { this.docWriter = docWriter; + infoStream = null; this.infoStream = infoStream; if (infoStream != null) Index: contrib/benchmark/conf/indexEnwiki.alg =================================================================== --- contrib/benchmark/conf/indexEnwiki.alg (revision 0) +++ contrib/benchmark/conf/indexEnwiki.alg (revision 0) @@ -0,0 +1,30 @@ + analyzer=org.apache.lucene.analysis.SimpleAnalyzer + + directory=FSDirectory + + ram.flush.mb=64 + + max.field.length=2147483647 + + compound=false + max.buffered=70000 + + doc.add.log.step=5000 + + docs.file=work/enwiki.txt + + doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker + + doc.tokenized=true + doc.maker.forever=false + + ResetSystemErase + CreateIndex + { "All" + {AddDoc}: * + } + + CloseIndex + + RepSumByPref All + RepSumByPref AddDoc Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java (revision 563241) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java (working copy) @@ -20,6 +20,7 @@ import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.XMLReader; +import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.XMLReaderFactory; @@ -28,24 +29,23 @@ import java.io.File; import java.io.FileInputStream; import java.io.FileWriter; +import java.io.BufferedWriter; import java.io.IOException; /** - * Extract the downloaded Wikipedia dump into separate files for indexing. + * Extract the downloaded Wikipedia dump into a single file for + * indexing. Each wikipedia document becomes one line in the result + * file. 
*/ public class ExtractWikipedia { private File wikipedia; - private File outputDir; + private BufferedWriter output; - public ExtractWikipedia(File wikipedia, File outputDir) { + public ExtractWikipedia(File wikipedia, File output) + throws IOException { this.wikipedia = wikipedia; - this.outputDir = outputDir; - System.out.println("Deleting all files in " + outputDir); - File [] files = outputDir.listFiles(); - for (int i = 0; i < files.length; i++) { - files[i].delete(); - } + this.output = new BufferedWriter( new FileWriter( output ) ); } static public int count = 0; @@ -55,8 +55,7 @@ public class Parser extends DefaultHandler { - public Parser() { - } + public Parser() {} StringBuffer contents = new StringBuffer(); @@ -91,45 +90,22 @@ } } - public File directory (int count, File directory) { - if (directory == null) { - directory = outputDir; - } - int base = BASE; - while (base <= count) { - base *= BASE; - } - if (count < BASE) { - return directory; - } - directory = new File (directory, (Integer.toString(base / BASE))); - directory = new File (directory, (Integer.toString(count / (base / BASE)))); - return directory(count % (base / BASE), directory); - } + public void create(String id, String title, String time, String body) + throws IOException { - public void create(String id, String title, String time, String body) { - - File d = directory(count++, null); - d.mkdirs(); - File f = new File(d, id + ".txt"); - - StringBuffer contents = new StringBuffer(); + id = id.replace('\t', ' '); + title = title.replace('\t', ' '); + time = time.replace('\t', ' '); + body = body.replaceAll("[\t\n]", " "); - contents.append(time); - contents.append("\n\n"); - contents.append(title); - contents.append("\n\n"); - contents.append(body); - contents.append("\n"); + output.write(title); + output.write('\t'); + output.write(time); + output.write('\t'); + output.write(body); + output.newLine(); + output.flush(); - try { - FileWriter writer = new FileWriter(f); - writer.write(contents.toString()); - writer.close(); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - } String time(String original) { @@ -147,7 +123,8 @@ return buffer.toString(); } - public void endElement(String namespace, String simple, String qualified) { + public void endElement(String namespace, String simple, String qualified) + throws SAXException { if (qualified.equals("title")) { title = contents.toString(); } else if (qualified.equals("text")) { @@ -162,7 +139,11 @@ id = contents.toString(); } else if (qualified.equals("page")) { if (body != null) { - create(id, title, time, body); + try { + create(id, title, time, body); + } catch ( IOException ioe ) { + throw new SAXException( ioe ); + } } } } @@ -187,7 +168,8 @@ } } - public static void main(String[] args) { + public static void main(String[] args) + throws IOException { if (args.length != 2) { printUsage(); } @@ -195,9 +177,8 @@ File wikipedia = new File(args[0]); if (wikipedia.exists()) { - File outputDir = new File(args[1]); - outputDir.mkdirs(); - ExtractWikipedia extractor = new ExtractWikipedia(wikipedia, outputDir); + File output = new File(args[1]); + ExtractWikipedia extractor = new ExtractWikipedia(wikipedia, output); extractor.extract(); } else { printUsage(); Index: contrib/benchmark/build.xml =================================================================== --- contrib/benchmark/build.xml (revision 563241) +++ contrib/benchmark/build.xml (working copy) @@ -23,7 +23,7 @@