Index: lucene/contrib/misc/src/java/org/apache/lucene/index/PKIndexSplitter.java
===================================================================
--- lucene/contrib/misc/src/java/org/apache/lucene/index/PKIndexSplitter.java	(revision 1137067)
+++ lucene/contrib/misc/src/java/org/apache/lucene/index/PKIndexSplitter.java	(working copy)
@@ -19,113 +19,114 @@
 import java.io.IOException;
 
-import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.search.TermRangeFilter;
 import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.OpenBitSet;
+import org.apache.lucene.util.OpenBitSetDISI;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 
 /**
- * Split an index based on a given primary key term
- * and a 'middle' term. If the middle term is present, it's
- * sent to dir2.
+ * Split an index based on a {@link Filter}.
  */
 public class PKIndexSplitter {
-  private Term midTerm;
-  Directory input;
-  Directory dir1;
-  Directory dir2;
+  private final Filter docsInFirstIndex;
+  private final Directory input;
+  private final Directory dir1;
+  private final Directory dir2;
   
-  public PKIndexSplitter(Term midTerm, Directory input,
-      Directory dir1, Directory dir2) {
-    this.midTerm = midTerm;
+  /**
+   * Split an index based on a {@link Filter}. All documents that match the filter
+   * are sent to dir1, remaining ones to dir2.
+   */
+  public PKIndexSplitter(Directory input, Directory dir1, Directory dir2, Filter docsInFirstIndex) {
     this.input = input;
     this.dir1 = dir1;
     this.dir2 = dir2;
+    this.docsInFirstIndex = docsInFirstIndex;
  }
  
+  /**
+   * Split an index based on a given primary key term
+   * and a 'middle' term. Documents whose primary key is equal to
+   * or greater than the middle term are sent to dir2.
+   */
+  public PKIndexSplitter(Directory input, Directory dir1, Directory dir2, Term midTerm) {
+    this(input, dir1, dir2,
+      new TermRangeFilter(midTerm.field(), null, midTerm.bytes(), true, false));
+  }
+  
   public void split() throws IOException {
+    boolean success = false;
     IndexReader reader = IndexReader.open(input);
-    OpenBitSet lowDels = setDeletes(reader, null, midTerm.bytes());
-    OpenBitSet hiDels = setDeletes(reader, midTerm.bytes(), null);
-    
-    createIndex(dir1, reader, lowDels);
-    createIndex(dir2, reader, hiDels);
-    reader.close();
+    try {
+      createIndex(dir1, reader, docsInFirstIndex, false);
+      createIndex(dir2, reader, docsInFirstIndex, true);
+      success = true;
+    } finally {
+      IOUtils.closeSafely(!success, reader);
+    }
   }
   
-  private void createIndex(Directory target, IndexReader reader, OpenBitSet bv) throws IOException {
+  private void createIndex(Directory target, IndexReader reader, Filter preserveFilter, boolean negateFilter) throws IOException {
+    boolean success = false;
     IndexWriter w = new IndexWriter(target, new IndexWriterConfig(
-        Version.LUCENE_CURRENT,
-        new WhitespaceAnalyzer(Version.LUCENE_CURRENT))
-        .setOpenMode(OpenMode.CREATE));
-    w.addIndexes(new DeletesIndexReader(reader, bv));
-    w.close();
-  }
-  
-  private OpenBitSet setDeletes(IndexReader reader, BytesRef startTerm,
-      BytesRef endTermExcl) throws IOException {
-    OpenBitSet incl = new OpenBitSet(reader.maxDoc());
-    Terms terms = MultiFields.getTerms(reader, midTerm.field());
-    TermsEnum te = terms.iterator();
-    if (startTerm != null) {
-      te.seek(startTerm);
+        Version.LUCENE_CURRENT, null).setOpenMode(OpenMode.CREATE));
+    try {
+      w.addIndexes(new DocumentFilteredIndexReader(reader, preserveFilter, negateFilter));
+      success = true;
+    } finally {
+      IOUtils.closeSafely(!success, w);
     }
-    while (true) {
-      final BytesRef term = te.next();
-      if (term == null) {
-        break;
-      }
-      if (endTermExcl != null && term.compareTo(endTermExcl) >= 0) {
-        break;
-      }
-      DocsEnum docs = MultiFields.getTermDocsEnum(reader,
-          MultiFields.getDeletedDocs(reader), midTerm.field(), term);
-      while (true) {
-        final int doc = docs.nextDoc();
-        if (doc != DocsEnum.NO_MORE_DOCS) {
-          incl.set(doc);
-        } else break;
-      }
-    }
-    OpenBitSet dels = new OpenBitSet(reader.maxDoc());
-    for (int x=0; x < reader.maxDoc(); x++) {
-      if (!incl.get(x)) {
-        dels.set(x);
-      }
-    }
-    return dels;
   }
-  
-  public static class DeletesIndexReader extends FilterIndexReader {
-    OpenBitSet readerDels;
-    
-    public DeletesIndexReader(IndexReader reader, OpenBitSet deletes) {
+  
+  public static class DocumentFilteredIndexReader extends FilterIndexReader {
+    final Bits readerDels;
+    final int numDocs;
+    
+    public DocumentFilteredIndexReader(IndexReader reader, Filter preserveFilter, boolean negateFilter) throws IOException {
       super(new SlowMultiReaderWrapper(reader));
-      readerDels = new OpenBitSet(reader.maxDoc());
+      
+      final OpenBitSetDISI bits = new OpenBitSetDISI(in.maxDoc());
+      final DocIdSet docs = preserveFilter.getDocIdSet((AtomicReaderContext) in.getTopReaderContext());
+      if (docs != null) {
+        final DocIdSetIterator it = docs.iterator();
+        if (it != null) {
+          bits.inPlaceOr(it);
+        }
+      }
+      // note the inversion: "bits" marks documents to delete, so unless the
+      // filter is negated we flip it and delete everything the filter does NOT match
+      if (!negateFilter) {
+        bits.flip(0, in.maxDoc());
+      }
+      
       if (in.hasDeletions()) {
-        final Bits oldDelBits = MultiFields.getDeletedDocs(in);
+        final Bits oldDelBits = in.getDeletedDocs();
         assert oldDelBits != null;
         for (int i = 0; i < in.maxDoc(); i++) {
-          if (oldDelBits.get(i) || deletes.get(i)) {
-            readerDels.set(i);
+          if (oldDelBits.get(i)) {
+            bits.set(i);
           }
         }
-      } else {
-        readerDels = deletes;
       }
+      
+      this.readerDels = bits;
+      this.numDocs = in.maxDoc() - (int) bits.cardinality();
     }
     
     @Override
     public int numDocs() {
-      return in.maxDoc() - (int)readerDels.cardinality();
+      return numDocs;
     }
     
     @Override
     public boolean hasDeletions() {
-      return (int)readerDels.cardinality() > 0;
+      return (in.maxDoc() != numDocs);
     }
     
     @Override
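
For reference, a minimal usage sketch of the new API (not part of the patch; the directory paths, class name, and the "id" field value are illustrative assumptions). The Term-based convenience constructor keeps all documents with a primary key strictly below the middle term in dir1 and sends the rest to dir2:

    import java.io.File;

    import org.apache.lucene.index.PKIndexSplitter;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.TermRangeFilter;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public class SplitByPKExample {
      public static void main(String[] args) throws Exception {
        Directory input = FSDirectory.open(new File("index"));      // existing source index
        Directory dir1  = FSDirectory.open(new File("index-low"));  // keys < middle term
        Directory dir2  = FSDirectory.open(new File("index-high")); // keys >= middle term

        // Term-based convenience constructor; internally this builds
        // new TermRangeFilter("id", null, mid.bytes(), true, false).
        Term mid = new Term("id", "000000011");
        PKIndexSplitter splitter = new PKIndexSplitter(input, dir1, dir2, mid);
        splitter.split();

        // Equivalent, using the general Filter-based constructor:
        // new PKIndexSplitter(input, dir1, dir2,
        //     new TermRangeFilter(mid.field(), null, mid.bytes(), true, false)).split();
      }
    }

Any other Filter works the same way: whatever it matches lands in dir1, everything else in dir2.
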
Index: lucene/contrib/misc/src/test/org/apache/lucene/index/TestPKIndexSplitter.java
===================================================================
--- lucene/contrib/misc/src/test/org/apache/lucene/index/TestPKIndexSplitter.java	(revision 1137067)
+++ lucene/contrib/misc/src/test/org/apache/lucene/index/TestPKIndexSplitter.java	(working copy)
@@ -20,73 +20,95 @@
 import java.text.DecimalFormat;
 import java.text.NumberFormat;
 
-import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
 import org.apache.lucene.document.Field.Index;
 import org.apache.lucene.document.Field.Store;
-import org.apache.lucene.document.Field.TermVector;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.Version;
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.LuceneTestCase;
 
+public class TestPKIndexSplitter extends LuceneTestCase {
 
-public class TestPKIndexSplitter extends LuceneTestCase {
-  public void testSplit() throws Exception {
+  public void testSplit() throws Exception {
     NumberFormat format = new DecimalFormat("000000000");
-    
     Directory dir = newDirectory();
-    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(
-        Version.LUCENE_CURRENT,
-        new WhitespaceAnalyzer(Version.LUCENE_CURRENT))
+    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
+        TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false))
         .setOpenMode(OpenMode.CREATE));
-    for (int x=0; x < 10; x++) {
+    for (int x = 0; x < 11; x++) {
       Document doc = createDocument(x, "1", 3, format);
       w.addDocument(doc);
     }
-    for (int x=15; x < 20; x++) {
+    for (int x = 11; x < 20; x++) {
       Document doc = createDocument(x, "2", 3, format);
       w.addDocument(doc);
     }
     w.close();
     
+    final Term midTerm = new Term("id", format.format(11));
+    
+    checkSplitting(dir, midTerm, 11, 9);
+    
+    // delete some documents
+    w = new IndexWriter(dir, newIndexWriterConfig(
+        TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false))
+        .setOpenMode(OpenMode.APPEND));
+    w.deleteDocuments(midTerm);
+    w.deleteDocuments(new Term("id", format.format(2)));
+    w.close();
+    
+    checkSplitting(dir, midTerm, 10, 8);
+    
+    dir.close();
+  }
+  
+  private void checkSplitting(Directory dir, Term splitTerm, int leftCount, int rightCount) throws Exception {
     Directory dir1 = newDirectory();
     Directory dir2 = newDirectory();
-    Term splitTerm = new Term("id", new BytesRef(format.format(11)));
-    PKIndexSplitter splitter = new PKIndexSplitter(splitTerm,
-        dir, dir1, dir2);
+    PKIndexSplitter splitter = new PKIndexSplitter(dir,
+        dir1, dir2, splitTerm);
     splitter.split();
     
     IndexReader ir1 = IndexReader.open(dir1);
     IndexReader ir2 = IndexReader.open(dir2);
-    assertEquals(10, ir1.maxDoc());
-    assertEquals(4, ir2.maxDoc());
+    assertEquals(leftCount, ir1.numDocs());
+    assertEquals(rightCount, ir2.numDocs());
+    
+    checkContents(ir1, "1");
+    checkContents(ir2, "2");
+    
     ir1.close();
     ir2.close();
     dir1.close();
     dir2.close();
-    dir.close();
   }
   
-  public Document createDocument(int n, String indexName,
+  private void checkContents(IndexReader ir, String indexname) throws Exception {
+    final Bits delDocs = MultiFields.getDeletedDocs(ir);
+    for (int i = 0; i < ir.maxDoc(); i++) {
+      if (delDocs == null || !delDocs.get(i)) {
+        assertEquals(indexname, ir.document(i).get("indexname"));
+      }
+    }
+  }
+  
+  private Document createDocument(int n, String indexName,
       int numFields, NumberFormat format) {
     StringBuilder sb = new StringBuilder();
     Document doc = new Document();
     String id = format.format(n);
-    doc.add(new Field("id", id, Store.YES, Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
-    doc.add(new Field("indexname", indexName, Store.YES, Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
+    doc.add(newField("id", id, Store.YES, Index.NOT_ANALYZED));
+    doc.add(newField("indexname", indexName, Store.YES, Index.NOT_ANALYZED));
     sb.append("a");
     sb.append(n);
-    doc.add(new Field("field1", sb.toString(), Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
+    doc.add(newField("field1", sb.toString(), Store.YES, Index.ANALYZED));
     sb.append(" b");
     sb.append(n);
     for (int i = 1; i < numFields; i++) {
-      doc.add(new Field("field" + (i + 1), sb.toString(), Store.YES,
-          Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
+      doc.add(newField("field" + (i + 1), sb.toString(), Store.YES, Index.ANALYZED));
     }
     return doc;
   }
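
A note on the inverted filter handling in DocumentFilteredIndexReader: the bit set is first filled with the documents the filter *preserves*, but FilterIndexReader needs *deletion* bits, hence the flip when the filter is not negated. A minimal sketch of that bookkeeping (illustrative only; class name and the 5-document setup are assumptions, standing in for the filter's DocIdSet):

    import org.apache.lucene.util.OpenBitSet;
    import org.apache.lucene.util.OpenBitSetDISI;

    public class FlipSketch {
      public static void main(String[] args) throws Exception {
        // Stand-in for preserveFilter.getDocIdSet(...): the filter matches docs 0, 1, 2
        // out of a 5-document segment (maxDoc == 5).
        OpenBitSet matches = new OpenBitSet(5);
        matches.set(0); matches.set(1); matches.set(2);

        OpenBitSetDISI bits = new OpenBitSetDISI(5);
        bits.inPlaceOr(matches.iterator()); // bits = {0, 1, 2}: the docs to KEEP

        // negateFilter == false (the dir1 pass): flip so bits marks deletions.
        bits.flip(0, 5);                    // bits = {3, 4}: the docs to DROP

        // numDocs as the reader computes it: maxDoc minus deletions.
        System.out.println("kept: " + (5 - bits.cardinality())); // prints: kept: 3
      }
    }

On the dir2 pass (negateFilter == true) the flip is skipped, so the filter's matches themselves become the deletions; existing deletions from the source reader are then OR'ed in on both passes.
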