Index: lucene/join/src/java/org/apache/lucene/search/join/BytesRefIterable.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- lucene/join/src/java/org/apache/lucene/search/join/BytesRefIterable.java (revision ) +++ lucene/join/src/java/org/apache/lucene/search/join/BytesRefIterable.java (revision ) @@ -0,0 +1,26 @@ +package org.apache.lucene.search.join; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.BytesRefIterator; + +interface BytesRefIterable { + /** Returns an iterator over a set of BytesRef elements. The elements must come back in sorted order: the consumer in TermsQuery walks the iterator forward only, comparing each element against the terms of the index. */ + BytesRefIterator iterator(); +} + Index: lucene/join/src/test/org/apache/lucene/search/join/TestJoinUtil.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- lucene/join/src/test/org/apache/lucene/search/join/TestJoinUtil.java (revision 1448793) +++ lucene/join/src/test/org/apache/lucene/search/join/TestJoinUtil.java (revision ) @@ -17,9 +17,6 @@ * limitations under the License. 
*/ -import java.io.IOException; -import java.util.*; - import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; @@ -28,12 +25,10 @@ import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.BinaryDocValues; -import org.apache.lucene.index.DocTermOrds; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.RandomIndexWriter; -import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.index.SlowCompositeReaderWrapper; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.Term; @@ -54,13 +49,26 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.FixedBitSet; -import org.apache.lucene.util.LuceneTestCase.Slow; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; import org.junit.Test; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; + public class TestJoinUtil extends LuceneTestCase { + @Test public void testSimple() throws Exception { final String idField = "id"; final String toField = "productId"; @@ -143,6 +151,7 @@ dir.close(); } + @Test public void testSimpleWithScoring() throws Exception { final String idField = "id"; final String toField = "movieId"; 
Index: lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java (revision 1448793) +++ lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java (revision ) @@ -24,10 +24,9 @@ import org.apache.lucene.search.Query; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.BytesRefIterator; import java.io.IOException; -import java.util.Comparator; /** * A query that has an array of terms from a specific field. This query will match documents have one or more terms in @@ -37,14 +36,14 @@ */ class TermsQuery extends MultiTermQuery { - private final BytesRefHash terms; + private final BytesRefIterable terms; private final Query fromQuery; // Used for equals() only /** * @param field The field that should contain terms that are specified in the previous parameter * @param terms The terms that matching documents should have. The terms must be sorted by natural order. 
*/ - TermsQuery(String field, Query fromQuery, BytesRefHash terms) { + TermsQuery(String field, Query fromQuery, BytesRefIterable terms) { super(field); this.fromQuery = fromQuery; this.terms = terms; @@ -52,10 +51,6 @@ @Override protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException { - if (this.terms.size() == 0) { - return TermsEnum.EMPTY; - } - return new SeekingTermSetTermsEnum(terms.iterator(null), this.terms); } @@ -91,76 +86,55 @@ return result; } + // Intersects the wrapped TermsEnum with the sorted term iterator in one forward pass over both. static class SeekingTermSetTermsEnum extends FilteredTermsEnum { + private BytesRef current; // term most recently pulled from the iterator; null before the first seek and once the iterator is exhausted + private final BytesRefIterator iterator; - private final BytesRefHash terms; - private final int[] ords; - private final int lastElement; - private final BytesRef lastTerm; - private final BytesRef spare = new BytesRef(); - private final Comparator comparator; - - private BytesRef seekTerm; - private int upto = 0; - - SeekingTermSetTermsEnum(TermsEnum tenum, BytesRefHash terms) { + SeekingTermSetTermsEnum(TermsEnum tenum, BytesRefIterable terms) { super(tenum); - this.terms = terms; - - lastElement = terms.size() - 1; - ords = terms.sort(comparator = tenum.getComparator()); - lastTerm = terms.get(ords[lastElement], new BytesRef()); - seekTerm = terms.get(ords[upto], spare); + this.iterator = terms.iterator(); } @Override protected BytesRef nextSeekTerm(BytesRef currentTerm) throws IOException { - BytesRef temp = seekTerm; - seekTerm = null; - return temp; + if (currentTerm == null) { + // first call: seek to the iterator's first term (null when the iterator is empty) + return current = iterator.next(); + } else { + do { + if (current.compareTo(currentTerm) > 0) { + return current; - } + } + } while ((current = iterator.next()) != null); + return null; + } + } @Override protected AcceptStatus accept(BytesRef term) throws IOException { - if (comparator.compare(term, lastTerm) > 0) { + if (current == null) { return AcceptStatus.END; } - + - BytesRef currentTerm = terms.get(ords[upto], 
spare); - if (comparator.compare(term, currentTerm) == 0) { - if (upto == lastElement) { - return AcceptStatus.YES; - } else { - seekTerm = terms.get(ords[++upto], spare); + int cmp = term.compareTo(current); + if (cmp == 0) { - return AcceptStatus.YES_AND_SEEK; + return AcceptStatus.YES_AND_SEEK; - } + } else if (cmp < 0) { + return AcceptStatus.NO_AND_SEEK; } else { - if (upto == lastElement) { - return AcceptStatus.NO; - } else { // Our current term doesn't match the the given term. - int cmp; - do { // We maybe are behind the given term by more than one step. Keep incrementing till we're the same or higher. - if (upto == lastElement) { - return AcceptStatus.NO; - } - // typically the terms dict is a superset of query's terms so it's unusual that we have to skip many of - // our terms so we don't do a binary search here - seekTerm = terms.get(ords[++upto], spare); - } while ((cmp = comparator.compare(seekTerm, term)) < 0); + // term is past current: advance the iterator until it reaches or passes term + while ((current = iterator.next()) != null) { + cmp = term.compareTo(current); if (cmp == 0) { - if (upto == lastElement) { - return AcceptStatus.YES; + return AcceptStatus.YES; + } else if (cmp < 0) { + return AcceptStatus.NO_AND_SEEK; // current is now ahead of term; nextSeekTerm() hands it back as the seek target - } + } - seekTerm = terms.get(ords[++upto], spare); - return AcceptStatus.YES_AND_SEEK; - } else { - return AcceptStatus.NO_AND_SEEK; - } + } + return AcceptStatus.END; - } - } - } + } + } + } - - } - } Index: lucene/join/src/java/org/apache/lucene/search/join/TermsCollector.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- lucene/join/src/java/org/apache/lucene/search/join/TermsCollector.java (revision 1449668) +++ lucene/join/src/java/org/apache/lucene/search/join/TermsCollector.java (revision ) @@ -17,17 +17,27 @@ * limitations under the License. 
*/ -import java.io.IOException; - +import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.MultiTermsEnum; +import org.apache.lucene.index.MultiTermsEnum.TermsEnumIndex; +import org.apache.lucene.index.ReaderSlice; import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.SortedSetDocValuesTermsEnum; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.Collector; import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.Scorer; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.BytesRefIterator; +import org.apache.lucene.util.OpenBitSet; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + /** * A collector that collects all terms from a specified field matching the query. * @@ -42,8 +52,8 @@ this.field = field; } - public BytesRefHash getCollectorTerms() { - return collectorTerms; + public BytesRefIterable getCollectorTerms() { + return new BytesRefHashIterable(collectorTerms); } @Override @@ -68,7 +78,7 @@ // impl that works with multiple values per document static class MV extends TermsCollector { - final BytesRef scratch = new BytesRef(); + private SortedSetDocValues docTermOrds; MV(String field) { @@ -80,16 +90,49 @@ docTermOrds.setDocument(doc); long ord; while ((ord = docTermOrds.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { - docTermOrds.lookupOrd(ord, scratch); - collectorTerms.add(scratch); + currentBits.set(ord); /* ord is a long and is passed through as-is; the former (int) cast could only truncate it */ } } - + + // per-segment resources: if this works well e.g. 
we'd move to TermsCollector base class maybe + private final List<OpenBitSet> bits = new ArrayList<OpenBitSet>(); /* one set of collected ordinals per segment, appended in setNextReader() alongside values */ + private final List<SortedSetDocValues> values = new ArrayList<SortedSetDocValues>(); /* the doc values each entry of bits refers to, same index */ + private OpenBitSet currentBits; // current bits for speed + @Override public void setNextReader(AtomicReaderContext context) throws IOException { docTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader(), field); + currentBits = new OpenBitSet(docTermOrds.getValueCount()); + values.add(docTermOrds); + bits.add(currentBits); } + + @Override + public BytesRefIterable getCollectorTerms() { + return new BytesRefIterable() { + + @Override + public BytesRefIterator iterator() { + int numSubs = values.size(); + ReaderSlice slices[] = new ReaderSlice[numSubs]; + TermsEnumIndex indexes[] = new TermsEnumIndex[numSubs]; + for (int i = 0; i < slices.length; i++) { + slices[i] = new ReaderSlice(0, 0, i); + TermsEnum te = new DocValuesConsumer.BitsFilteredTermsEnum(new SortedSetDocValuesTermsEnum(values.get(i)), bits.get(i)); + indexes[i] = new TermsEnumIndex(te, i); - } + } + MultiTermsEnum mte = new MultiTermsEnum(slices); + try { + mte.reset(indexes); + } catch (IOException nocommit) { + throw new RuntimeException(nocommit); + } + return mte; + } + }; + } + } + // impl that works with single value per document static class SV extends TermsCollector { Index: lucene/join/src/java/org/apache/lucene/search/join/BytesRefHashIterable.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- lucene/join/src/java/org/apache/lucene/search/join/BytesRefHashIterable.java (revision ) +++ lucene/join/src/java/org/apache/lucene/search/join/BytesRefHashIterable.java (revision ) @@ -0,0 +1,63 @@ +package org.apache.lucene.search.join; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.BytesRefIterator; + +import java.io.IOException; +import java.util.Comparator; + +/* + * 
Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +class BytesRefHashIterable implements BytesRefIterable { /* Exposes the entries of a BytesRefHash as a BytesRefIterable, in the order produced by hash.sort(...). */ + final BytesRefHash hash; + final int[] ords; /* array returned by hash.sort(...); position i holds the id handed to hash.get() for the i-th element */ + final int count; /* hash.size() at construction time; marks the end of iteration */ + + BytesRefHashIterable(BytesRefHash hash) { + this.hash = hash; + this.ords = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator()); /* NOTE(review): BytesRefHash.sort is documented as a destructive operation - confirm nothing adds to the hash after this point */ + this.count = hash.size(); + } + + @Override + public BytesRefIterator iterator() { + return new BytesRefIterator() { + int currentOrd = 0; + final BytesRef scratch = new BytesRef(); /* filled by hash.get() and returned from every next() call; callers must copy a value they want to keep */ + + @Override + public BytesRef next() throws IOException { + if (currentOrd == count) { + return null; + } else { + int ord = ords[currentOrd]; + hash.get(ord, scratch); + currentOrd++; + return scratch; + } + } + + @Override + public Comparator<BytesRef> getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + }; + } +} + Index: lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java (revision 1448793) +++ lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java (revision ) @@ 
-17,12 +17,6 @@ * limitations under the License. */ -import java.io.Closeable; -import java.io.IOException; -import java.util.Iterator; -import java.util.List; -import java.util.NoSuchElementException; - import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.FieldInfo; @@ -41,6 +35,12 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.OpenBitSet; +import java.io.Closeable; +import java.io.IOException; +import java.util.Iterator; +import java.util.List; +import java.util.NoSuchElementException; + /** * Abstract API that consumes numeric, binary and * sorted docvalues. Concrete implementations of this @@ -607,10 +607,11 @@ } // TODO: seek-by-ord to nextSetBit - static class BitsFilteredTermsEnum extends FilteredTermsEnum { + public static class BitsFilteredTermsEnum extends FilteredTermsEnum { + final OpenBitSet liveTerms; - BitsFilteredTermsEnum(TermsEnum in, OpenBitSet liveTerms) { + public BitsFilteredTermsEnum(TermsEnum in, OpenBitSet liveTerms) { super(in, false); // <-- not passing false here wasted about 3 hours of my time!!!!!!!!!!!!! assert liveTerms != null; this.liveTerms = liveTerms; Index: lucene/core/src/java/org/apache/lucene/index/MultiTermsEnum.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- lucene/core/src/java/org/apache/lucene/index/MultiTermsEnum.java (revision 1448793) +++ lucene/core/src/java/org/apache/lucene/index/MultiTermsEnum.java (revision ) @@ -49,7 +49,7 @@ private BytesRef current; private Comparator termComp; - static class TermsEnumIndex { + public static class TermsEnumIndex { public final static TermsEnumIndex[] EMPTY_ARRAY = new TermsEnumIndex[0]; final int subIndex; final TermsEnum termsEnum;