--- lucene/contrib/CHANGES.txt 2011-06-16 19:56:08.496774800 +0200 +++ lucene/contrib/CHANGES.txt 2011-06-18 12:49:39.980077700 +0200 @@ -29,6 +29,10 @@ facilitate doing grouping in a distributed environment (Uwe Schindler, Mike McCandless) + * LUCENE-2919: Added PKIndexSplitter, that splits an index according + to a middle term in a specified field. (Jason Rutherglen via Mike + McCandless, Uwe Schindler) + API Changes * LUCENE-3141: add getter method to access fragInfos in FieldFragList. --- lucene/contrib/misc/src/java/org/apache/lucene/index/PKIndexSplitter.java 1970-01-01 01:00:00.000000000 +0100 +++ lucene/contrib/misc/src/java/org/apache/lucene/index/PKIndexSplitter.java 2011-06-18 12:53:48.528293900 +0200 @@ -0,0 +1,155 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.store.Directory; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.TermRangeFilter; +import org.apache.lucene.util.OpenBitSetDISI; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.Version; + +/** + * Splits one input index into two target indexes: documents matching a {@link Filter} are copied into the first, all other non-deleted documents into the second. + */ +public class PKIndexSplitter { + private final Filter docsInFirstIndex; + private final Directory input; + private final Directory dir1; + private final Directory dir2; + + /** + * Split an index based on a {@link Filter}. All documents that match the filter + * are sent to dir1, remaining ones to dir2. This constructor only stores its arguments; the work happens in {@link #split()}. + */ + public PKIndexSplitter(Directory input, Directory dir1, Directory dir2, Filter docsInFirstIndex) { + this.input = input; + this.dir1 = dir1; + this.dir2 = dir2; + this.docsInFirstIndex = docsInFirstIndex; + } + + /** + * Split an index on the field of the given 'middle' term: documents with a term in that + * field that sorts before the middle term are sent to dir1. If the middle term is present, it's + * sent to dir2, as are all other documents not sent to dir1.
+ */ + public PKIndexSplitter(Directory input, Directory dir1, Directory dir2, Term midTerm) { + this(input, dir1, dir2, + new TermRangeFilter(midTerm.field(), null, midTerm.text(), true, false)); + } + /** Opens a reader on the input index and builds both targets from it: dir1 from the documents the filter matches, then dir2 from the remaining ones. The reader is closed in the finally block whether or not the copy succeeded. */ + public void split() throws IOException { + boolean success = false; + IndexReader reader = IndexReader.open(input); + try { + createIndex(dir1, reader, docsInFirstIndex, false); + createIndex(dir2, reader, docsInFirstIndex, true); + success = true; + } finally { + IOUtils.closeSafely(!success, reader); + } + } + /** Opens target with OpenMode.CREATE and adds to it the documents of reader that stay visible through a {@link DocumentFilteredIndexReader} built from preserveFilter and negateFilter. The writer is closed in the finally block. */ + private void createIndex(Directory target, IndexReader reader, Filter preserveFilter, boolean negateFilter) throws IOException { + boolean success = false; + IndexWriter w = new IndexWriter(target, new IndexWriterConfig( + Version.LUCENE_CURRENT, null).setOpenMode(OpenMode.CREATE)); + try { + w.addIndexes(new DocumentFilteredIndexReader(reader, preserveFilter, negateFilter)); + success = true; + } finally { + IOUtils.closeSafely(!success, w); + } + } + /** View of an IndexReader that reports as deleted every document not selected for the target index, in addition to the documents already deleted in the wrapped reader. */ + public static class DocumentFilteredIndexReader extends FilterIndexReader { + final OpenBitSetDISI readerDels; /* set bit = this view reports the document as deleted */ + final int numDocs; /* maxDoc of the wrapped reader minus the number of set bits in readerDels */ + /** Computes the set of hidden documents once, up front: with negateFilter false the filter's matches stay visible, with negateFilter true the filter's matches are the ones hidden. */ + public DocumentFilteredIndexReader(IndexReader reader, Filter preserveFilter, boolean negateFilter) throws IOException { + super(reader); + /* Start from the filter's matches; a null DocIdSet or a null iterator leaves the set empty, i.e. is treated as no matches. */ + final OpenBitSetDISI bits = new OpenBitSetDISI(in.maxDoc()); + final DocIdSet docs = preserveFilter.getDocIdSet(in); + if (docs != null) { + final DocIdSetIterator it = docs.iterator(); + if (it != null) { + bits.inPlaceOr(it); + } + } + // bits marks the documents to hide, so the sense is inverted: without negation the matches are flipped (everything else gets hidden); if we negate the filter, we delete all documents it matches!
+ if (!negateFilter) { + bits.flip(0, in.maxDoc()); + } + /* Documents already deleted in the wrapped reader stay deleted in this view. */ + if (in.hasDeletions()) { + for (int i = 0; i < in.maxDoc(); i++) { + if (in.isDeleted(i)) { + bits.set(i); + } + } + } + + this.readerDels = bits; + this.numDocs = in.maxDoc() - (int) bits.cardinality(); + } + /** Returns the number of documents this view does not report as deleted. */ + @Override + public int numDocs() { + return numDocs; + } + /** Returns true if this view reports at least one document as deleted. */ + @Override + public boolean hasDeletions() { + return (in.maxDoc() != numDocs); + } + /** Returns true if document n was filtered out or was already deleted in the wrapped reader. */ + @Override + public boolean isDeleted(int n) { + return readerDels.get(n); + } + /** Always returns null. NOTE(review): presumably so that consumers use this filtered view as a whole instead of descending into the wrapped reader's sub-readers - confirm. */ + @Override + public IndexReader[] getSequentialSubReaders() { + return null; + } + /** Wraps the wrapped reader's TermPositions so that next() skips documents this view reports as deleted. NOTE(review): only next() is overridden in this block - confirm that is sufficient for the callers. */ + @Override + public TermPositions termPositions() throws IOException { + return new FilterTermPositions(in.termPositions()) { + /* Keep advancing until a visible document is found or the enumeration ends; res is false only in the latter case. */ + @Override + public boolean next() throws IOException { + boolean res; + while ((res = super.next())) { + if (!readerDels.get(doc())) { + break; + } + } + return res; + } + }; + } + } +} --- lucene/contrib/misc/src/test/org/apache/lucene/index/TestPKIndexSplitter.java 1970-01-01 01:00:00.000000000 +0100 +++ lucene/contrib/misc/src/test/org/apache/lucene/index/TestPKIndexSplitter.java 2011-06-18 12:54:55.592129700 +0200 @@ -0,0 +1,113 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License.
+ */ + +import java.text.DecimalFormat; +import java.text.NumberFormat; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +/** Tests {@link PKIndexSplitter} by splitting a small index at a middle "id" term, before and after deleting documents. */ +public class TestPKIndexSplitter extends LuceneTestCase { + /** Indexes ids 0-10 with indexname "1" and ids 11-19 with indexname "2", splits at id 11 (expecting 11 and 9 documents), then deletes ids 11 and 2 and splits again (expecting 10 and 8). */ + public void testSplit() throws Exception { + NumberFormat format = new DecimalFormat("000000000"); + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( + TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)) + .setOpenMode(OpenMode.CREATE)); + for (int x = 0; x < 11; x++) { + Document doc = createDocument(x, "1", 3, format); + w.addDocument(doc); + } + for (int x = 11; x < 20; x++) { + Document doc = createDocument(x, "2", 3, format); + w.addDocument(doc); + } + w.close(); + + final Term midTerm = new Term("id", format.format(11)); + + checkSplitting(dir, midTerm, 11, 9); + + // delete some documents + w = new IndexWriter(dir, newIndexWriterConfig( + TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)) + .setOpenMode(OpenMode.APPEND)); + w.deleteDocuments(midTerm); + w.deleteDocuments(new Term("id", format.format(2))); + w.close(); + + checkSplitting(dir, midTerm, 10, 8); + + dir.close(); + } + /** Splits dir at splitTerm into two new directories, then asserts the document count of each side and that the first side holds only indexname "1" and the second only indexname "2". */ + private void checkSplitting(Directory dir, Term splitTerm, int leftCount, int rightCount) throws Exception { + Directory dir1 = newDirectory(); + Directory dir2 = newDirectory(); + PKIndexSplitter splitter = new PKIndexSplitter(dir, dir1, dir2, splitTerm); + splitter.split(); + + IndexReader ir1 = IndexReader.open(dir1); + IndexReader ir2 = IndexReader.open(dir2); + assertEquals(leftCount, ir1.numDocs()); + assertEquals(rightCount, ir2.numDocs()); + 
checkContents(ir1, "1"); + checkContents(ir2, "2"); + + ir1.close(); + ir2.close(); + + dir1.close(); + dir2.close(); + } + /** Asserts that every non-deleted document in ir stores the given value in its "indexname" field. */ + private void checkContents(IndexReader ir, String indexname) throws Exception { + for (int i = 0; i < ir.maxDoc(); i++) { + if (!ir.isDeleted(i)) { + assertEquals(indexname, ir.document(i).get("indexname")); + } + } + } + /** Builds a document with "id" set to format.format(n), the given "indexname", "field1" holding "a" followed by n, and "field2" up to "field" followed by numFields each holding that text extended by " b" followed by n. */ + private Document createDocument(int n, String indexName, + int numFields, NumberFormat format) { + StringBuilder sb = new StringBuilder(); + Document doc = new Document(); + String id = format.format(n); + doc.add(newField("id", id, Store.YES, Index.NOT_ANALYZED)); + doc.add(newField("indexname", indexName, Store.YES, Index.NOT_ANALYZED)); + sb.append("a"); + sb.append(n); + doc.add(newField("field1", sb.toString(), Store.YES, Index.ANALYZED)); + sb.append(" b"); + sb.append(n); + for (int i = 1; i < numFields; i++) { + doc.add(newField("field" + (i + 1), sb.toString(), Store.YES, Index.ANALYZED)); + } + return doc; + } +}