Index: modules/join/src/java/org/apache/lucene/search/join/query/JoinUtil.java =================================================================== --- modules/join/src/java/org/apache/lucene/search/join/query/JoinUtil.java (revision ) +++ modules/join/src/java/org/apache/lucene/search/join/query/JoinUtil.java (revision ) @@ -0,0 +1,64 @@ +package org.apache.lucene.search.join.query; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.*; +import org.apache.lucene.search.join.query.TermsCollector; +import org.apache.lucene.search.join.query.TermsQuery; +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; + +/** + * Utility for query time joining using {@link TermsQuery} and {@link TermsCollector}. + * + * @lucene.experimental + */ +public final class JoinUtil { + + // No instances allowed + private JoinUtil() { + } + + /** + * Convenience method for query time joining. + * + * Execute the returned query with a {@link IndexSearcher} to retrieve all documents that have the same terms in the + * to field that match with documents matching the specified fromQuery and have the same terms in the from field. + * + * @param fromField The from field to join from + * @param multipleValuesPerDocument Whether the from field has multiple terms per document + * @param toField The to field to join to + * @param fromQuery The query to match documents on the from side + * @param fromSearcher The searcher that executed the specified fromQuery + * @return a {@link MultiTermQuery} instance that can be used to join documents based on the + * terms in the from and to field + * @throws IOException If I/O related errors occur + */ + public static MultiTermQuery createJoinQuery(String fromField, + boolean multipleValuesPerDocument, + String toField, + Query fromQuery, + IndexSearcher fromSearcher) throws IOException { + TermsCollector termsCollector = TermsCollector.create(fromField, multipleValuesPerDocument); + fromSearcher.search(fromQuery, termsCollector); + BytesRef[] terms = termsCollector.getCollectedTerms(); + return new TermsQuery(terms, toField); + } + +} Index: modules/join/src/java/org/apache/lucene/search/join/query/package.html =================================================================== --- modules/join/src/java/org/apache/lucene/search/join/query/package.html (revision ) +++ modules/join/src/java/org/apache/lucene/search/join/query/package.html (revision ) @@ -0,0 +1,60 @@ + + + +
+ + + ++ This package contains classes to support term based query time joining. Terms are fetched from a fromField and a + toField. +
++ The query time joining is implemented as two pass search. The first pass collects all the terms from the fromField + that match the fromQuery. The second pass returns all documents that have matching terms in the toField to the terms + collected in the first pass. +
+Query time joining has the following input:
+fromField: The from field to join from.
+ fromQuery: The query executed to collect the from terms. This is usually the user specified query.
+ multipleValuesPerDocument: Whether the fromField contains more than one value per document
+ toField: The to field to join to
+
+ The joining depends on the {@link org.apache.lucene.search.join.query.TermsCollector} and
+ {@link org.apache.lucene.search.join.query.TermsQuery}. The TermsCollector collects the the from terms
+ matching the fromQuery. The TermsQuery is responsible for matching the documents that have matching terms
+ in the toField with the collected terms from the TermsCollector. There is also a JoinUtil
+ utility class to simplify the joining.
+
+ Example usage of the {@link org.apache.lucene.search.join.query.JoinUtil#createJoinQuery(String, boolean, String, org.apache.lucene.search.Query, org.apache.lucene.search.IndexSearcher)} : +
+
+ String fromField = "from"; // Name of the from field
+ boolean multipleValuesPerDocument = false; // Set only yo true in the case when your fromField has multiple values per document in your index
+ String fromField = "to"; // Name of the to field
+ Query fromQuery = new TermQuery(new Term("content", searchTerm)); // Query executed to collect from values to join to the to values
+
+ MultiTermQuery joinQuery = JoinUtil.createJoinQuery(fromField, multipleValuesPerDocument, toField, fromQuery, fromSearcher);
+ TopDocs topDocs = toSearcher.search(joinQuery, 10); // Note: toSearcher can be the same as the fromSearcher
+ // Render topDocs...
+
+
+
Index: modules/join/src/java/org/apache/lucene/search/join/query/TermsQuery.java
===================================================================
--- modules/join/src/java/org/apache/lucene/search/join/query/TermsQuery.java (revision )
+++ modules/join/src/java/org/apache/lucene/search/join/query/TermsQuery.java (revision )
@@ -0,0 +1,120 @@
+package org.apache.lucene.search.join.query;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.FilteredTermsEnum;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.BytesRef;
+
+import java.io.IOException;
+
+/**
+ * A query that has an array of terms from a specific field. This query will match documents have one or more terms in
+ * the specified field that match with the terms specified in the array.
+ *
+ * @lucene.experimental
+ */
+public class TermsQuery extends MultiTermQuery {
+
+ private final BytesRef[] terms;
+ private TermsEnum reuse = null;
+
+ /**
+ * @param terms The terms that matching documents should have. The terms must be sorted by natural order.
+ * @param field The field that should contain terms that are specified in the previous parameter
+ */
+ public TermsQuery(BytesRef[] terms, String field) {
+ super(field);
+ this.terms = terms;
+ }
+
+ protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
+ if (this.terms.length == 0) {
+ return TermsEnum.EMPTY;
+ }
+
+ TermsEnum iter = terms.iterator(reuse);
+ return new SeekingTermSetTermsEnum(iter, this.terms);
+ }
+
+ public String toString(String string) {
+ return "TermsQuery{" +
+ "field=" + field +
+ '}';
+ }
+
+ static class SeekingTermSetTermsEnum extends FilteredTermsEnum {
+
+ private final BytesRef[] terms;
+ private final int lastElement;
+
+ private BytesRef seekTerm;
+ private int upto = 0;
+
+ SeekingTermSetTermsEnum(TermsEnum tenum, BytesRef[] terms) throws IOException {
+ super(tenum);
+ this.terms = terms;
+ lastElement = this.terms.length - 1;
+ seekTerm = terms[upto];
+ }
+
+ @Override
+ protected BytesRef nextSeekTerm(BytesRef currentTerm) throws IOException {
+ BytesRef temp = seekTerm;
+ seekTerm = null;
+ return temp;
+ }
+
+ protected AcceptStatus accept(BytesRef term) throws IOException {
+ if (term.compareTo(terms[lastElement]) > 0) {
+ return AcceptStatus.END;
+ }
+
+ if (terms[upto].equals(term)) {
+ if (upto == lastElement) {
+ return AcceptStatus.YES;
+ } else {
+ seekTerm = terms[++upto];
+ return AcceptStatus.YES_AND_SEEK;
+ }
+ } else {
+ if (upto == lastElement) {
+ return AcceptStatus.NO;
+ } else {
+ int cmp;
+ do {
+ if (upto == lastElement) {
+ return AcceptStatus.NO;
+ }
+ seekTerm = terms[++upto]; // Maybe something smarter? Like binary search?
+ } while ((cmp = getComparator().compare(seekTerm, term)) < 0);
+ if (cmp == 0) {
+ return AcceptStatus.YES;
+ } else {
+ return AcceptStatus.NO_AND_SEEK;
+ }
+ }
+ }
+ }
+
+ }
+
+}
Index: modules/join/src/java/org/apache/lucene/search/join/query/TermsCollector.java
===================================================================
--- modules/join/src/java/org/apache/lucene/search/join/query/TermsCollector.java (revision )
+++ modules/join/src/java/org/apache/lucene/search/join/query/TermsCollector.java (revision )
@@ -0,0 +1,135 @@
+package org.apache.lucene.search.join.query;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.DocTermOrds;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.FieldCache;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefHash;
+
+import java.io.IOException;
+
+/**
+ * A collector that collects all terms from a specified field matching the query.
+ *
+ * @lucene.experimental
+ */
+public abstract class TermsCollector extends Collector {
+
+ protected final String field;
+ protected final BytesRefHash collecterTerms = new BytesRefHash();
+ protected final BytesRef spare = new BytesRef();
+
+ protected TermsCollector(String field) {
+ this.field = field;
+ }
+
+ /**
+ * Returns the collected terms as a sorted array in natural order.
+ *
+ * Note: Can only be invoked once per instance.
+ *
+ * @return the collected terms as a sorted array in natural order
+ */
+ public BytesRef[] getCollectedTerms() {
+ BytesRef[] terms = new BytesRef[collecterTerms.size()];
+ int[] ords = collecterTerms.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+ for (int i = 0; i < terms.length; i++) {
+ terms[i] = collecterTerms.get(ords[i], new BytesRef());
+ }
+ return terms;
+ }
+
+ public void setScorer(Scorer scorer) throws IOException {
+ }
+
+ public boolean acceptsDocsOutOfOrder() {
+ return true;
+ }
+
+ /**
+ * Chooses the right {@link TermsCollector} implementation.
+ *
+ * @param field The field to collect terms for
+ * @param multipleValuesPerDocument Whether the field to collect terms for has multiple values per document.
+ * @return a {@link TermsCollector} instance
+ */
+ public static TermsCollector create(String field, boolean multipleValuesPerDocument) {
+ return multipleValuesPerDocument ? new MV(field) : new SV(field);
+ }
+
+ // impl that works with multiple values per document
+ static class MV extends TermsCollector {
+
+ private DocTermOrds docTermOrds;
+ private TermsEnum docTermsEnum;
+ private DocTermOrds.TermOrdsIterator reuse;
+
+ MV(String field) {
+ super(field);
+ }
+
+ public void collect(int doc) throws IOException {
+ reuse = docTermOrds.lookup(doc, reuse);
+ int[] buffer = new int[5];
+
+ int chunk;
+ do {
+ chunk = reuse.read(buffer);
+ if (chunk == 0) {
+ return;
+ }
+
+ for (int idx = 0; idx < chunk; idx++) {
+ int key = buffer[idx];
+ docTermsEnum.seekExact((long) key);
+ collecterTerms.add(docTermsEnum.term());
+ }
+ } while (chunk >= buffer.length);
+ }
+
+ public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException {
+ docTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader, field);
+ docTermsEnum = docTermOrds.getOrdTermsEnum(context.reader);
+ reuse = null; // LUCENE-3377 needs to be fixed first then this statement can be removed...
+ }
+ }
+
+ // impl that works with single value per document
+ static class SV extends TermsCollector {
+
+ private FieldCache.DocTerms fromDocTerms;
+
+ SV(String field) {
+ super(field);
+ }
+
+ public void collect(int doc) throws IOException {
+ collecterTerms.add(fromDocTerms.getTerm(doc, spare));
+ }
+
+ public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException {
+ fromDocTerms = FieldCache.DEFAULT.getTerms(context.reader, field);
+ }
+ }
+
+}
Index: modules/join/src/test/org/apache/lucene/search/join/query/TestJoinUtil.java
===================================================================
--- modules/join/src/test/org/apache/lucene/search/join/query/TestJoinUtil.java (revision )
+++ modules/join/src/test/org/apache/lucene/search/join/query/TestJoinUtil.java (revision )
@@ -0,0 +1,357 @@
+package org.apache.lucene.search.join.query;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.*;
+import org.apache.lucene.search.*;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.*;
+
+public class TestJoinUtil extends LuceneTestCase {
+
+ public void testSimple() throws Exception {
+ final String idField = "id";
+ final String toField = "productId";
+
+ Directory dir = newDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(
+ random,
+ dir,
+ newIndexWriterConfig(TEST_VERSION_CURRENT,
+ new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
+
+ // 0
+ Document doc = new Document();
+ doc.add(new Field("description", "random text", TextField.TYPE_STORED));
+ doc.add(new Field("name", "name1", TextField.TYPE_STORED));
+ doc.add(new Field(idField, "1", TextField.TYPE_STORED));
+ w.addDocument(doc);
+
+ // 1
+ doc = new Document();
+ doc.add(new Field("price", "10.0", TextField.TYPE_STORED));
+ doc.add(new Field(idField, "2", TextField.TYPE_STORED));
+ doc.add(new Field(toField, "1", TextField.TYPE_STORED));
+ w.addDocument(doc);
+
+ // 2
+ doc = new Document();
+ doc.add(new Field("price", "20.0", TextField.TYPE_STORED));
+ doc.add(new Field(idField, "3", TextField.TYPE_STORED));
+ doc.add(new Field(toField, "1", TextField.TYPE_STORED));
+ w.addDocument(doc);
+
+ // 3
+ doc = new Document();
+ doc.add(new Field("description", "more random text", TextField.TYPE_STORED));
+ doc.add(new Field("name", "name2", TextField.TYPE_STORED));
+ doc.add(new Field(idField, "4", TextField.TYPE_STORED));
+ w.addDocument(doc);
+ w.commit();
+
+ // 4
+ doc = new Document();
+ doc.add(new Field("price", "10.0", TextField.TYPE_STORED));
+ doc.add(new Field(idField, "5", TextField.TYPE_STORED));
+ doc.add(new Field(toField, "4", TextField.TYPE_STORED));
+ w.addDocument(doc);
+
+ // 5
+ doc = new Document();
+ doc.add(new Field("price", "20.0", TextField.TYPE_STORED));
+ doc.add(new Field(idField, "6", TextField.TYPE_STORED));
+ doc.add(new Field(toField, "4", TextField.TYPE_STORED));
+ w.addDocument(doc);
+
+ IndexSearcher indexSearcher = new IndexSearcher(w.getReader());
+ w.close();
+
+ // Search for product
+ MultiTermQuery joinQuery =
+ JoinUtil.createJoinQuery(idField, false, toField, new TermQuery(new Term("name", "name2")), indexSearcher);
+
+ TopDocs result = indexSearcher.search(joinQuery, 10);
+ assertEquals(2, result.totalHits);
+ assertEquals(4, result.scoreDocs[0].doc);
+ assertEquals(5, result.scoreDocs[1].doc);
+
+ joinQuery = JoinUtil.createJoinQuery(idField, false, toField, new TermQuery(new Term("name", "name1")), indexSearcher);
+ result = indexSearcher.search(joinQuery, 10);
+ assertEquals(2, result.totalHits);
+ assertEquals(1, result.scoreDocs[0].doc);
+ assertEquals(2, result.scoreDocs[1].doc);
+
+ // Search for offer
+ joinQuery = JoinUtil.createJoinQuery(toField, false, idField, new TermQuery(new Term("id", "5")), indexSearcher);
+ result = indexSearcher.search(joinQuery, 10);
+ assertEquals(1, result.totalHits);
+ assertEquals(3, result.scoreDocs[0].doc);
+
+ indexSearcher.getIndexReader().close();
+ dir.close();
+ }
+
+ @Test
+ public void testSingleValueRandomJoin() throws Exception {
+ int maxIndexIter = _TestUtil.nextInt(random, 6, 12);
+ int maxSearchIter = _TestUtil.nextInt(random, 13, 26);
+ executeRandomJoin(false, maxIndexIter, maxSearchIter);
+ }
+
+ @Test
+ // This test really takes more time, that is why the number of iterations are smaller.
+ public void testMultiValueRandomJoin() throws Exception {
+ int maxIndexIter = _TestUtil.nextInt(random, 3, 6);
+ int maxSearchIter = _TestUtil.nextInt(random, 6, 12);
+ executeRandomJoin(true, maxIndexIter, maxSearchIter);
+ }
+
+ private void executeRandomJoin(boolean multipleValuesPerDocument, int maxIndexIter, int maxSearchIter) throws Exception {
+ for (int indexIter = 1; indexIter <= maxIndexIter; indexIter++) {
+ if (VERBOSE) {
+ System.out.println("indexIter=" + indexIter);
+ }
+ Directory dir = newDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(
+ random,
+ dir,
+ newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.KEYWORD, false)).setMergePolicy(newLogMergePolicy())
+ );
+ int numberOfDocumentsToIndex = _TestUtil.nextInt(random, 87, 764);
+ IndexIterationContext context = createContext(numberOfDocumentsToIndex, w, multipleValuesPerDocument);
+
+ IndexReader topLevelReader = w.getReader();
+ w.close();
+ for (int searchIter = 1; searchIter <= maxSearchIter; searchIter++) {
+ if (VERBOSE) {
+ System.out.println("searchIter=" + searchIter);
+ }
+ IndexSearcher indexSearcher = newSearcher(topLevelReader);
+
+ int r = random.nextInt(context.randomUniqueValues.length);
+ boolean from = context.randomFrom[r];
+ String randomValue = context.randomUniqueValues[r];
+ FixedBitSet expectedResult = createExpectedResult(randomValue, from, indexSearcher.getIndexReader(), context);
+
+ Query actualQuery = new TermQuery(new Term("value", randomValue));
+ if (VERBOSE) {
+ System.out.println("actualQuery=" + actualQuery);
+ }
+ MultiTermQuery joinQuery;
+ if (from) {
+ joinQuery = JoinUtil.createJoinQuery("from", multipleValuesPerDocument, "to", actualQuery, indexSearcher);
+ } else {
+ joinQuery = JoinUtil.createJoinQuery("to", multipleValuesPerDocument, "from", actualQuery, indexSearcher);
+ }
+ if (VERBOSE) {
+ System.out.println("joinQuery=" + joinQuery);
+ }
+
+ // Need to know all documents that have matches. TopDocs doesn't give me that and then I'd be also testing TopDocsCollector...
+ final FixedBitSet actualResult = new FixedBitSet(indexSearcher.getIndexReader().maxDoc());
+ indexSearcher.search(joinQuery, new Collector() {
+
+ int docBase;
+
+ public void collect(int doc) throws IOException {
+ actualResult.set(doc + docBase);
+ }
+
+ public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException {
+ docBase = context.docBase;
+ }
+
+ public void setScorer(Scorer scorer) throws IOException {
+ }
+
+ public boolean acceptsDocsOutOfOrder() {
+ return true;
+ }
+ });
+
+ if (VERBOSE) {
+ System.out.println("expected cardinality:" + expectedResult.cardinality());
+ DocIdSetIterator iterator = expectedResult.iterator();
+ for (int doc = iterator.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iterator.nextDoc()) {
+ System.out.println(String.format("Expected doc[%d] with id value %s", doc, indexSearcher.doc(doc).get("id")));
+ }
+ System.out.println("actual cardinality:" + actualResult.cardinality());
+ iterator = actualResult.iterator();
+ for (int doc = iterator.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iterator.nextDoc()) {
+ System.out.println(String.format("Actual doc[%d] with id value %s", doc, indexSearcher.doc(doc).get("id")));
+ }
+ }
+
+ assertEquals(expectedResult, actualResult);
+ }
+ topLevelReader.close();
+ dir.close();
+ }
+ }
+
+ private IndexIterationContext createContext(int nDocs, RandomIndexWriter writer, boolean multipleValuesPerDocument) throws IOException {
+ return createContext(nDocs, writer, writer, multipleValuesPerDocument);
+ }
+
+ private IndexIterationContext createContext(int nDocs, RandomIndexWriter fromWriter, RandomIndexWriter toWriter, boolean multipleValuesPerDocument) throws IOException {
+ IndexIterationContext context = new IndexIterationContext();
+ int numRandomValues = nDocs / 2;
+ context.randomUniqueValues = new String[numRandomValues];
+ Set