Index: modules/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java =================================================================== --- modules/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java (revision 1229659) +++ modules/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java (working copy) @@ -17,6 +17,7 @@ * limitations under the License. */ +import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -26,6 +27,7 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.NumericField; import org.apache.lucene.document.StringField; +import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LogDocMergePolicy; import org.apache.lucene.index.RandomIndexWriter; @@ -34,10 +36,9 @@ import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.grouping.GroupDocs; import org.apache.lucene.search.grouping.TopGroups; -import org.apache.lucene.search.join.BlockJoinCollector; -import org.apache.lucene.search.join.BlockJoinQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.ReaderUtil; import org.apache.lucene.util._TestUtil; @@ -57,7 +58,7 @@ private Document makeJob(String skill, int year) { Document job = new Document(); job.add(newField("skill", skill, StringField.TYPE_STORED)); - job.add(new NumericField("year").setIntValue(year)); + job.add(new NumericField("year", NumericField.TYPE_STORED).setIntValue(year)); return job; } @@ -104,14 +105,14 @@ // Wrap the child document query to 'join' any matches // up to corresponding parent: - BlockJoinQuery childJoinQuery = new BlockJoinQuery(childQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg); + ToParentBlockJoinQuery childJoinQuery = new 
ToParentBlockJoinQuery(childQuery, parentsFilter, ToParentBlockJoinQuery.ScoreMode.Avg); // Combine the parent and nested child queries into a single query for a candidate BooleanQuery fullQuery = new BooleanQuery(); fullQuery.add(new BooleanClause(parentQuery, Occur.MUST)); fullQuery.add(new BooleanClause(childJoinQuery, Occur.MUST)); - BlockJoinCollector c = new BlockJoinCollector(Sort.RELEVANCE, 1, true, false); + ToParentBlockJoinCollector c = new ToParentBlockJoinCollector(Sort.RELEVANCE, 1, true, false); s.search(fullQuery, c); @@ -131,9 +132,34 @@ Document parentDoc = s.doc(group.groupValue); assertEquals("Lisa", parentDoc.get("name")); + + //System.out.println("TEST: now test up"); + + // Now join "up" (map parent hits to child docs) instead...: + ToChildBlockJoinQuery parentJoinQuery = new ToChildBlockJoinQuery(parentQuery, parentsFilter, random.nextBoolean()); + BooleanQuery fullChildQuery = new BooleanQuery(); + fullChildQuery.add(new BooleanClause(parentJoinQuery, Occur.MUST)); + fullChildQuery.add(new BooleanClause(childQuery, Occur.MUST)); + + //System.out.println("FULL: " + fullChildQuery); + TopDocs hits = s.search(fullChildQuery, 10); + assertEquals(1, hits.totalHits); + childDoc = s.doc(hits.scoreDocs[0].doc); + //System.out.println("CHILD = " + childDoc + " docID=" + hits.scoreDocs[0].doc); + assertEquals("java", childDoc.get("skill")); + assertEquals(2007, ((NumericField) childDoc.getField("year")).numericValue()); + assertEquals("Lisa", getParentDoc(r, parentsFilter, hits.scoreDocs[0].doc).get("name")); r.close(); dir.close(); } + + private Document getParentDoc(IndexReader reader, Filter parents, int childDocID) throws IOException { + final AtomicReaderContext[] leaves = ReaderUtil.leaves(reader.getTopReaderContext()); + final int subIndex = ReaderUtil.subIndex(childDocID, leaves); + final AtomicReaderContext leaf = leaves[subIndex]; + final FixedBitSet bits = (FixedBitSet) parents.getDocIdSet(leaf, null); + return 
leaf.reader.document(bits.nextSetBit(childDocID - leaf.docBase)); + } public void testBoostBug() throws Exception { final Directory dir = newDirectory(); @@ -142,7 +168,7 @@ w.close(); IndexSearcher s = newSearcher(r); - BlockJoinQuery q = new BlockJoinQuery(new MatchAllDocsQuery(), new QueryWrapperFilter(new MatchAllDocsQuery()), BlockJoinQuery.ScoreMode.Avg); + ToParentBlockJoinQuery q = new ToParentBlockJoinQuery(new MatchAllDocsQuery(), new QueryWrapperFilter(new MatchAllDocsQuery()), ToParentBlockJoinQuery.ScoreMode.Avg); s.search(q, 10); BooleanQuery bq = new BooleanQuery(); bq.setBoost(2f); // we boost the BQ @@ -199,8 +225,9 @@ public void testRandom() throws Exception { // We build two indices at once: one normalized (which - // BlockJoinQuery/Collector can query) and the other w/ - // same docs just fully denormalized: + // ToParentBlockJoinQuery/Collector, + // ToChildBlockJoinQuery can query) and the other w/ + // the same docs, just fully denormalized: final Directory dir = newDirectory(); final Directory joinDir = newDirectory(); @@ -212,7 +239,7 @@ // Values for child fields: final String[][] childFields = getRandomFields(numParentDocs); - // TODO: test star join, nested join cases too! + // TODO: parallel star join, nested join cases too! 
final RandomIndexWriter w = new RandomIndexWriter(random, dir); final RandomIndexWriter joinW = new RandomIndexWriter(random, joinDir); for(int parentDocID=0;parentDocID joinDocs = new ArrayList(); if (VERBOSE) { - System.out.println(" " + parentDoc); + StringBuilder sb = new StringBuilder(); + sb.append("parentID=" + parentDoc.get("parentID")); + for(int fieldID=0;fieldID joinResults) throws Exception { // results is 'complete'; joinResults is a subset int resultUpto = 0; @@ -539,8 +743,8 @@ // Wrap the child document query to 'join' any matches // up to corresponding parent: - BlockJoinQuery childJobJoinQuery = new BlockJoinQuery(childJobQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg); - BlockJoinQuery childQualificationJoinQuery = new BlockJoinQuery(childQualificationQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg); + ToParentBlockJoinQuery childJobJoinQuery = new ToParentBlockJoinQuery(childJobQuery, parentsFilter, ToParentBlockJoinQuery.ScoreMode.Avg); + ToParentBlockJoinQuery childQualificationJoinQuery = new ToParentBlockJoinQuery(childQualificationQuery, parentsFilter, ToParentBlockJoinQuery.ScoreMode.Avg); // Combine the parent and nested child queries into a single query for a candidate BooleanQuery fullQuery = new BooleanQuery(); @@ -548,12 +752,13 @@ fullQuery.add(new BooleanClause(childJobJoinQuery, Occur.MUST)); fullQuery.add(new BooleanClause(childQualificationJoinQuery, Occur.MUST)); - //????? How do I control volume of jobs vs qualifications per parent? 
- BlockJoinCollector c = new BlockJoinCollector(Sort.RELEVANCE, 10, true, false); + // Collects all job and qualification child docs for + // each resume hit in the top N (sorted by score): + ToParentBlockJoinCollector c = new ToParentBlockJoinCollector(Sort.RELEVANCE, 10, true, false); s.search(fullQuery, c); - //Examine "Job" children + // Examine "Job" children boolean showNullPointerIssue=true; if (showNullPointerIssue) { TopGroups jobResults = c.getTopGroups(childJobJoinQuery, null, 0, 10, 0, true); @@ -573,10 +778,9 @@ assertEquals("Lisa", parentDoc.get("name")); } - //Now Examine qualification children + // Now Examine qualification children TopGroups qualificationResults = c.getTopGroups(childQualificationJoinQuery, null, 0, 10, 0, true); - //!!!!! This next line can null pointer - but only if prior "jobs" section called first assertEquals(1, qualificationResults.totalGroupedHitCount); assertEquals(1, qualificationResults.groups.length); @@ -610,7 +814,7 @@ new QueryWrapperFilter( new TermQuery(new Term("parent", "1")))); - BlockJoinQuery q = new BlockJoinQuery(tq, parentFilter, BlockJoinQuery.ScoreMode.Avg); + ToParentBlockJoinQuery q = new ToParentBlockJoinQuery(tq, parentFilter, ToParentBlockJoinQuery.ScoreMode.Avg); Weight weight = s.createNormalizedWeight(q); DocIdSetIterator disi = weight.scorer(ReaderUtil.leaves(s.getIndexReader().getTopReaderContext())[0], true, true, null); assertEquals(1, disi.advance(1)); @@ -644,7 +848,7 @@ new QueryWrapperFilter( new TermQuery(new Term("isparent", "yes")))); - BlockJoinQuery q = new BlockJoinQuery(tq, parentFilter, BlockJoinQuery.ScoreMode.Avg); + ToParentBlockJoinQuery q = new ToParentBlockJoinQuery(tq, parentFilter, ToParentBlockJoinQuery.ScoreMode.Avg); Weight weight = s.createNormalizedWeight(q); DocIdSetIterator disi = weight.scorer(ReaderUtil.leaves(s.getIndexReader().getTopReaderContext())[0], true, true, null); assertEquals(2, disi.advance(0)); Index: 
modules/join/src/java/org/apache/lucene/search/join/BlockJoinQuery.java =================================================================== --- modules/join/src/java/org/apache/lucene/search/join/BlockJoinQuery.java (revision 1229659) +++ modules/join/src/java/org/apache/lucene/search/join/BlockJoinQuery.java (working copy) @@ -1,409 +0,0 @@ -package org.apache.lucene.search.join; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; -import java.util.Collection; -import java.util.Collections; -import java.util.Set; - -import org.apache.lucene.index.IndexReader.AtomicReaderContext; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriter; // javadocs -import org.apache.lucene.index.Term; -import org.apache.lucene.search.DocIdSet; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.Explanation; -import org.apache.lucene.search.Filter; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Scorer.ChildScorer; -import org.apache.lucene.search.Weight; -import org.apache.lucene.search.grouping.TopGroups; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.Bits; -import org.apache.lucene.util.FixedBitSet; - -/** - * This query requires that you index - * children and parent docs as a single block, using the - * {@link IndexWriter#addDocuments} or {@link - * IndexWriter#updateDocuments} API. In each block, the - * child documents must appear first, ending with the parent - * document. At search time you provide a Filter - * identifying the parents, however this Filter must provide - * an {@link FixedBitSet} per sub-reader. - * - *

Once the block index is built, use this query to wrap - * any sub-query matching only child docs and join matches in that - * child document space up to the parent document space. - * You can then use this Query as a clause with - * other queries in the parent document space.

- * - *

The child documents must be orthogonal to the parent - * documents: the wrapped child query must never - * return a parent document.

- * - * If you'd like to retrieve {@link TopGroups} for the - * resulting query, use the {@link BlockJoinCollector}. - * Note that this is not necessary, ie, if you simply want - * to collect the parent documents and don't need to see - * which child documents matched under that parent, then - * you can use any collector. - * - *

NOTE: If the overall query contains parent-only - * matches, for example you OR a parent-only query with a - * joined child-only query, then the resulting collected documents - * will be correct, however the {@link TopGroups} you get - * from {@link BlockJoinCollector} will not contain every - * child for parents that had matched. - * - *

See {@link org.apache.lucene.search.join} for an - * overview.

- * - * @lucene.experimental - */ - -public class BlockJoinQuery extends Query { - - public static enum ScoreMode {None, Avg, Max, Total}; - - private final Filter parentsFilter; - private final Query childQuery; - - // If we are rewritten, this is the original childQuery we - // were passed; we use this for .equals() and - // .hashCode(). This makes rewritten query equal the - // original, so that user does not have to .rewrite() their - // query before searching: - private final Query origChildQuery; - private final ScoreMode scoreMode; - - public BlockJoinQuery(Query childQuery, Filter parentsFilter, ScoreMode scoreMode) { - super(); - this.origChildQuery = childQuery; - this.childQuery = childQuery; - this.parentsFilter = parentsFilter; - this.scoreMode = scoreMode; - } - - private BlockJoinQuery(Query origChildQuery, Query childQuery, Filter parentsFilter, ScoreMode scoreMode) { - super(); - this.origChildQuery = origChildQuery; - this.childQuery = childQuery; - this.parentsFilter = parentsFilter; - this.scoreMode = scoreMode; - } - - @Override - public Weight createWeight(IndexSearcher searcher) throws IOException { - return new BlockJoinWeight(this, childQuery.createWeight(searcher), parentsFilter, scoreMode); - } - - private static class BlockJoinWeight extends Weight { - private final Query joinQuery; - private final Weight childWeight; - private final Filter parentsFilter; - private final ScoreMode scoreMode; - - public BlockJoinWeight(Query joinQuery, Weight childWeight, Filter parentsFilter, ScoreMode scoreMode) { - super(); - this.joinQuery = joinQuery; - this.childWeight = childWeight; - this.parentsFilter = parentsFilter; - this.scoreMode = scoreMode; - } - - @Override - public Query getQuery() { - return joinQuery; - } - - @Override - public float getValueForNormalization() throws IOException { - return childWeight.getValueForNormalization() * joinQuery.getBoost() * joinQuery.getBoost(); - } - - @Override - public void normalize(float norm, float 
topLevelBoost) { - childWeight.normalize(norm, topLevelBoost * joinQuery.getBoost()); - } - - @Override - public Scorer scorer(AtomicReaderContext readerContext, boolean scoreDocsInOrder, - boolean topScorer, Bits acceptDocs) throws IOException { - // Pass scoreDocsInOrder true, topScorer false to our sub: - final Scorer childScorer = childWeight.scorer(readerContext, true, false, acceptDocs); - - if (childScorer == null) { - // No matches - return null; - } - - final int firstChildDoc = childScorer.nextDoc(); - if (firstChildDoc == DocIdSetIterator.NO_MORE_DOCS) { - // No matches - return null; - } - - final DocIdSet parents = parentsFilter.getDocIdSet(readerContext, readerContext.reader.getLiveDocs()); - // TODO: once we do random-access filters we can - // generalize this: - if (parents == null) { - // No matches - return null; - } - if (!(parents instanceof FixedBitSet)) { - throw new IllegalStateException("parentFilter must return FixedBitSet; got " + parents); - } - - return new BlockJoinScorer(this, childScorer, (FixedBitSet) parents, firstChildDoc, scoreMode); - } - - @Override - public Explanation explain(AtomicReaderContext reader, int doc) throws IOException { - // TODO - throw new UnsupportedOperationException(getClass().getName() + - " cannot explain match on parent document"); - } - - @Override - public boolean scoresDocsOutOfOrder() { - return false; - } - } - - static class BlockJoinScorer extends Scorer { - private final Scorer childScorer; - private final FixedBitSet parentBits; - private final ScoreMode scoreMode; - private int parentDoc = -1; - private float parentScore; - private int nextChildDoc; - - private int[] pendingChildDocs = new int[5]; - private float[] pendingChildScores; - private int childDocUpto; - - public BlockJoinScorer(Weight weight, Scorer childScorer, FixedBitSet parentBits, int firstChildDoc, ScoreMode scoreMode) { - super(weight); - //System.out.println("Q.init firstChildDoc=" + firstChildDoc); - this.parentBits = 
parentBits; - this.childScorer = childScorer; - this.scoreMode = scoreMode; - if (scoreMode != ScoreMode.None) { - pendingChildScores = new float[5]; - } - nextChildDoc = firstChildDoc; - } - - @Override - public Collection getChildren() { - return Collections.singletonList(new ChildScorer(childScorer, "BLOCK_JOIN")); - } - - int getChildCount() { - return childDocUpto; - } - - int[] swapChildDocs(int[] other) { - final int[] ret = pendingChildDocs; - if (other == null) { - pendingChildDocs = new int[5]; - } else { - pendingChildDocs = other; - } - return ret; - } - - float[] swapChildScores(float[] other) { - if (scoreMode == ScoreMode.None) { - throw new IllegalStateException("ScoreMode is None"); - } - final float[] ret = pendingChildScores; - if (other == null) { - pendingChildScores = new float[5]; - } else { - pendingChildScores = other; - } - return ret; - } - - @Override - public int nextDoc() throws IOException { - //System.out.println("Q.nextDoc() nextChildDoc=" + nextChildDoc); - - if (nextChildDoc == NO_MORE_DOCS) { - //System.out.println(" end"); - return parentDoc = NO_MORE_DOCS; - } - - // Gather all children sharing the same parent as nextChildDoc - parentDoc = parentBits.nextSetBit(nextChildDoc); - //System.out.println(" parentDoc=" + parentDoc); - assert parentDoc != -1; - - float totalScore = 0; - float maxScore = Float.NEGATIVE_INFINITY; - - childDocUpto = 0; - do { - //System.out.println(" c=" + nextChildDoc); - if (pendingChildDocs.length == childDocUpto) { - pendingChildDocs = ArrayUtil.grow(pendingChildDocs); - if (scoreMode != ScoreMode.None) { - pendingChildScores = ArrayUtil.grow(pendingChildScores); - } - } - pendingChildDocs[childDocUpto] = nextChildDoc; - if (scoreMode != ScoreMode.None) { - // TODO: specialize this into dedicated classes per-scoreMode - final float childScore = childScorer.score(); - pendingChildScores[childDocUpto] = childScore; - maxScore = Math.max(childScore, maxScore); - totalScore += childScore; - } - 
childDocUpto++; - nextChildDoc = childScorer.nextDoc(); - } while (nextChildDoc < parentDoc); - //System.out.println(" nextChildDoc=" + nextChildDoc); - - // Parent & child docs are supposed to be orthogonal: - assert nextChildDoc != parentDoc; - - switch(scoreMode) { - case Avg: - parentScore = totalScore / childDocUpto; - break; - case Max: - parentScore = maxScore; - break; - case Total: - parentScore = totalScore; - break; - case None: - break; - } - - //System.out.println(" return parentDoc=" + parentDoc); - return parentDoc; - } - - @Override - public int docID() { - return parentDoc; - } - - @Override - public float score() throws IOException { - return parentScore; - } - - @Override - public int advance(int parentTarget) throws IOException { - - //System.out.println("Q.advance parentTarget=" + parentTarget); - if (parentTarget == NO_MORE_DOCS) { - return parentDoc = NO_MORE_DOCS; - } - - if (parentTarget == 0) { - // Callers should only be passing in a docID from - // the parent space, so this means this parent - // has no children (it got docID 0), so it cannot - // possibly match. 
We must handle this case - // separately otherwise we pass invalid -1 to - // prevSetBit below: - return nextDoc(); - } - - final int prevParentDoc = parentBits.prevSetBit(parentTarget-1); - - //System.out.println(" rolled back to prevParentDoc=" + prevParentDoc + " vs parentDoc=" + parentDoc); - assert prevParentDoc >= parentDoc; - if (prevParentDoc > nextChildDoc) { - nextChildDoc = childScorer.advance(prevParentDoc); - // System.out.println(" childScorer advanced to child docID=" + nextChildDoc); - //} else { - //System.out.println(" skip childScorer advance"); - } - - // Parent & child docs are supposed to be orthogonal: - assert nextChildDoc != prevParentDoc; - - final int nd = nextDoc(); - //System.out.println(" return nextParentDoc=" + nd); - return nd; - } - } - - @Override - public void extractTerms(Set terms) { - childQuery.extractTerms(terms); - } - - @Override - public Query rewrite(IndexReader reader) throws IOException { - final Query childRewrite = childQuery.rewrite(reader); - if (childRewrite != childQuery) { - Query rewritten = new BlockJoinQuery(childQuery, - childRewrite, - parentsFilter, - scoreMode); - rewritten.setBoost(getBoost()); - return rewritten; - } else { - return this; - } - } - - @Override - public String toString(String field) { - return "BlockJoinQuery ("+childQuery.toString()+")"; - } - - @Override - public boolean equals(Object _other) { - if (_other instanceof BlockJoinQuery) { - final BlockJoinQuery other = (BlockJoinQuery) _other; - return origChildQuery.equals(other.origChildQuery) && - parentsFilter.equals(other.parentsFilter) && - scoreMode == other.scoreMode; - } else { - return false; - } - } - - @Override - public int hashCode() { - final int prime = 31; - int hash = 1; - hash = prime * hash + origChildQuery.hashCode(); - hash = prime * hash + scoreMode.hashCode(); - hash = prime * hash + parentsFilter.hashCode(); - return hash; - } - - @Override - public Object clone() { - return new BlockJoinQuery((Query) 
origChildQuery.clone(), - parentsFilter, - scoreMode); - } -} Index: modules/join/src/java/org/apache/lucene/search/join/BlockJoinCollector.java =================================================================== --- modules/join/src/java/org/apache/lucene/search/join/BlockJoinCollector.java (revision 1229659) +++ modules/join/src/java/org/apache/lucene/search/join/BlockJoinCollector.java (working copy) @@ -1,459 +0,0 @@ -package org.apache.lucene.search.join; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; -import java.util.Arrays; -import java.util.HashMap; -import java.util.LinkedList; -import java.util.Map; -import java.util.Queue; - -import org.apache.lucene.index.IndexReader.AtomicReaderContext; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriter; // javadocs -import org.apache.lucene.search.Collector; -import org.apache.lucene.search.FieldComparator; -import org.apache.lucene.search.FieldValueHitQueue; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoreCachingWrappingScorer; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Scorer.ChildScorer; -import org.apache.lucene.search.Sort; -import org.apache.lucene.search.TopDocs; -import org.apache.lucene.search.TopDocsCollector; -import org.apache.lucene.search.TopFieldCollector; -import org.apache.lucene.search.TopScoreDocCollector; -import org.apache.lucene.search.Weight; -import org.apache.lucene.search.grouping.GroupDocs; -import org.apache.lucene.search.grouping.TopGroups; -import org.apache.lucene.util.ArrayUtil; - - -/** Collects parent document hits for a Query containing one more more - * BlockJoinQuery clauses, sorted by the - * specified parent Sort. Note that this cannot perform - * arbitrary joins; rather, it requires that all joined - * documents are indexed as a doc block (using {@link - * IndexWriter#addDocuments} or {@link - * IndexWriter#updateDocuments}). Ie, the join is computed - * at index time. - * - *

The parent Sort must only use - * fields from the parent documents; sorting by field in - * the child documents is not supported.

- * - *

You should only use this - * collector if one or more of the clauses in the query is - * a {@link BlockJoinQuery}. This collector will find those query - * clauses and record the matching child documents for the - * top scoring parent documents.

- * - *

Multiple joins (star join) and nested joins and a mix - * of the two are allowed, as long as in all cases the - * documents corresponding to a single row of each joined - * parent table were indexed as a doc block.

- * - *

For the simple star join you can retrieve the - * {@link TopGroups} instance containing each {@link BlockJoinQuery}'s - * matching child documents for the top parent groups, - * using {@link #getTopGroups}. Ie, - * a single query, which will contain two or more - * {@link BlockJoinQuery}'s as clauses representing the star join, - * can then retrieve two or more {@link TopGroups} instances.

- * - *

For nested joins, the query will run correctly (ie, - * match the right parent and child documents), however, - * because TopGroups is currently unable to support nesting - * (each group is not able to hold another TopGroups), you - * are only able to retrieve the TopGroups of the first - * join. The TopGroups of the nested joins will not be - * correct. - * - * See {@link org.apache.lucene.search.join} for a code - * sample. - * - * @lucene.experimental - */ -public class BlockJoinCollector extends Collector { - - private final Sort sort; - - // Maps each BlockJoinQuery instance to its "slot" in - // joinScorers and in OneGroup's cached doc/scores/count: - private final Map joinQueryID = new HashMap(); - private final int numParentHits; - private final FieldValueHitQueue queue; - private final FieldComparator[] comparators; - private final int[] reverseMul; - private final int compEnd; - private final boolean trackMaxScore; - private final boolean trackScores; - - private int docBase; - private BlockJoinQuery.BlockJoinScorer[] joinScorers = new BlockJoinQuery.BlockJoinScorer[0]; - private IndexReader.AtomicReaderContext currentReaderContext; - private Scorer scorer; - private boolean queueFull; - - private OneGroup bottom; - private int totalHitCount; - private float maxScore = Float.NaN; - - /* Creates a BlockJoinCollector. The provided sort must - * not be null. 
*/ - public BlockJoinCollector(Sort sort, int numParentHits, boolean trackScores, boolean trackMaxScore) throws IOException { - // TODO: allow null sort to be specialized to relevance - // only collector - this.sort = sort; - this.trackMaxScore = trackMaxScore; - this.trackScores = trackScores; - this.numParentHits = numParentHits; - queue = FieldValueHitQueue.create(sort.getSort(), numParentHits); - comparators = queue.getComparators(); - reverseMul = queue.getReverseMul(); - compEnd = comparators.length - 1; - } - - private static final class OneGroup extends FieldValueHitQueue.Entry { - public OneGroup(int comparatorSlot, int parentDoc, float parentScore, int numJoins, boolean doScores) { - super(comparatorSlot, parentDoc, parentScore); - docs = new int[numJoins][]; - for(int joinID=0;joinID 0) { - // Definitely competitive. - break; - } else if (i == compEnd) { - // Here c=0. If we're at the last comparator, this doc is not - // competitive, since docs are visited in doc Id order, which means - // this doc cannot compete with any other document in the queue. - //System.out.println(" skip"); - return; - } - } - - //System.out.println(" competes! 
doc=" + (docBase + parentDoc)); - - // This hit is competitive - replace bottom element in queue & adjustTop - for (int i = 0; i < comparators.length; i++) { - comparators[i].copy(bottom.slot, parentDoc); - } - if (!trackMaxScore && trackScores) { - score = scorer.score(); - } - bottom.doc = docBase + parentDoc; - bottom.readerContext = currentReaderContext; - bottom.score = score; - copyGroups(bottom); - bottom = queue.updateTop(); - - for (int i = 0; i < comparators.length; i++) { - comparators[i].setBottom(bottom.slot); - } - } else { - // Startup transient: queue is not yet full: - final int comparatorSlot = totalHitCount - 1; - - // Copy hit into queue - for (int i = 0; i < comparators.length; i++) { - comparators[i].copy(comparatorSlot, parentDoc); - } - //System.out.println(" startup: new OG doc=" + (docBase+parentDoc)); - final OneGroup og = new OneGroup(comparatorSlot, docBase+parentDoc, score, joinScorers.length, trackScores); - og.readerContext = currentReaderContext; - copyGroups(og); - bottom = queue.add(og); - queueFull = totalHitCount == numParentHits; - if (queueFull) { - // End of startup transient: queue just filled up: - for (int i = 0; i < comparators.length; i++) { - comparators[i].setBottom(bottom.slot); - } - } - } - } - - // Pulls out child doc and scores for all join queries: - private void copyGroups(OneGroup og) { - // While rare, it's possible top arrays could be too - // short if join query had null scorer on first - // segment(s) but then became non-null on later segments - final int numSubScorers = joinScorers.length; - if (og.docs.length < numSubScorers) { - // While rare, this could happen if join query had - // null scorer on first segment(s) but then became - // non-null on later segments - og.docs = ArrayUtil.grow(og.docs); - } - if (og.counts.length < numSubScorers) { - og.counts = ArrayUtil.grow(og.counts); - } - if (trackScores && og.scores.length < numSubScorers) { - og.scores = ArrayUtil.grow(og.scores); - } - - 
//System.out.println("copyGroups parentDoc=" + og.doc); - for(int scorerIDX = 0;scorerIDX < numSubScorers;scorerIDX++) { - final BlockJoinQuery.BlockJoinScorer joinScorer = joinScorers[scorerIDX]; - //System.out.println(" scorer=" + joinScorer); - if (joinScorer != null) { - og.counts[scorerIDX] = joinScorer.getChildCount(); - //System.out.println(" count=" + og.counts[scorerIDX]); - og.docs[scorerIDX] = joinScorer.swapChildDocs(og.docs[scorerIDX]); - /* - for(int idx=0;idx queue = new LinkedList(); - queue.add(scorer); - while ((scorer = queue.poll()) != null) { - if (scorer instanceof BlockJoinQuery.BlockJoinScorer) { - enroll((BlockJoinQuery) scorer.getWeight().getQuery(), (BlockJoinQuery.BlockJoinScorer)scorer); - } - - for (ChildScorer sub : scorer.getChildren()) { - queue.add(sub.child); - } - } - } - - private final static class FakeScorer extends Scorer { - - float score; - int doc; - - public FakeScorer() { - super((Weight) null); - } - - @Override - public float score() { - return score; - } - - @Override - public int docID() { - return doc; - } - - @Override - public int advance(int target) { - throw new UnsupportedOperationException(); - } - - @Override - public int nextDoc() { - throw new UnsupportedOperationException(); - } - } - - private OneGroup[] sortedGroups; - - private void sortQueue() { - sortedGroups = new OneGroup[queue.size()]; - for(int downTo=queue.size()-1;downTo>=0;downTo--) { - sortedGroups[downTo] = queue.pop(); - } - } - - /** Return the TopGroups for the specified - * BlockJoinQuery. The groupValue of each GroupDocs will - * be the parent docID for that group. Note that the - * {@link GroupDocs#totalHits}, which would be the - * total number of child documents matching that parent, - * is not computed (will always be 0). Returns null if - * no groups matched. 
*/ - @SuppressWarnings("unchecked") - public TopGroups getTopGroups(BlockJoinQuery query, Sort withinGroupSort, int offset, int maxDocsPerGroup, int withinGroupOffset, boolean fillSortFields) - - throws IOException { - - final Integer _slot = joinQueryID.get(query); - if (_slot == null) { - if (totalHitCount == 0) { - return null; - } else { - throw new IllegalArgumentException("the Query did not contain the provided BlockJoinQuery"); - } - } - - // unbox once - final int slot = _slot; - - if (sortedGroups == null) { - if (offset >= queue.size()) { - return null; - } - sortQueue(); - } else if (offset > sortedGroups.length) { - return null; - } - - int totalGroupedHitCount = 0; - - final FakeScorer fakeScorer = new FakeScorer(); - - final GroupDocs[] groups = new GroupDocs[sortedGroups.length - offset]; - - for(int groupIDX=offset;groupIDX(topDocs.getMaxScore(), - og.counts[slot], - topDocs.scoreDocs, - og.doc, - groupSortValues); - } - - return new TopGroups(new TopGroups(sort.getSort(), - withinGroupSort == null ? null : withinGroupSort.getSort(), - 0, totalGroupedHitCount, groups), - totalHitCount); - } -} Index: modules/join/src/java/org/apache/lucene/search/join/ToChildBlockJoinQuery.java =================================================================== --- modules/join/src/java/org/apache/lucene/search/join/ToChildBlockJoinQuery.java (revision 0) +++ modules/join/src/java/org/apache/lucene/search/join/ToChildBlockJoinQuery.java (working copy) @@ -0,0 +1,316 @@ +package org.apache.lucene.search.join; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.Set; + +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; // javadocs +import org.apache.lucene.index.Term; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Scorer.ChildScorer; +import org.apache.lucene.search.Weight; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.FixedBitSet; + +/** + * Just like {@link ToParentBlockJoinQuery}, except this + * query joins in reverse: you provide a Query matching + * parent documents and it joins down to child + * documents. + * + * @lucene.experimental + */ + +public class ToChildBlockJoinQuery extends Query { + + private final Filter parentsFilter; + private final Query parentQuery; + + // If we are rewritten, this is the original parentQuery we + // were passed; we use this for .equals() and + // .hashCode(). 
This makes rewritten query equal the + // original, so that user does not have to .rewrite() their + // query before searching: + private final Query origParentQuery; + private final boolean doScores; + + public ToChildBlockJoinQuery(Query parentQuery, Filter parentsFilter, boolean doScores) { + super(); + this.origParentQuery = parentQuery; + this.parentQuery = parentQuery; + this.parentsFilter = parentsFilter; + this.doScores = doScores; + } + + private ToChildBlockJoinQuery(Query origParentQuery, Query parentQuery, Filter parentsFilter, boolean doScores) { + super(); + this.origParentQuery = origParentQuery; + this.parentQuery = parentQuery; + this.parentsFilter = parentsFilter; + this.doScores = doScores; + } + + @Override + public Weight createWeight(IndexSearcher searcher) throws IOException { + return new ToChildBlockJoinWeight(this, parentQuery.createWeight(searcher), parentsFilter, doScores); + } + + private static class ToChildBlockJoinWeight extends Weight { + private final Query joinQuery; + private final Weight parentWeight; + private final Filter parentsFilter; + private final boolean doScores; + + public ToChildBlockJoinWeight(Query joinQuery, Weight parentWeight, Filter parentsFilter, boolean doScores) { + super(); + this.joinQuery = joinQuery; + this.parentWeight = parentWeight; + this.parentsFilter = parentsFilter; + this.doScores = doScores; + } + + @Override + public Query getQuery() { + return joinQuery; + } + + @Override + public float getValueForNormalization() throws IOException { + return parentWeight.getValueForNormalization() * joinQuery.getBoost() * joinQuery.getBoost(); + } + + @Override + public void normalize(float norm, float topLevelBoost) { + parentWeight.normalize(norm, topLevelBoost * joinQuery.getBoost()); + } + + @Override + public Scorer scorer(AtomicReaderContext readerContext, boolean scoreDocsInOrder, + boolean topScorer, Bits acceptDocs) throws IOException { + // Pass scoreDocsInOrder true, topScorer false to our sub: + 
final Scorer parentScorer = parentWeight.scorer(readerContext, true, false, acceptDocs); + + if (parentScorer == null) { + // No matches + return null; + } + + final DocIdSet parents = parentsFilter.getDocIdSet(readerContext, readerContext.reader.getLiveDocs()); + // TODO: once we do random-access filters we can + // generalize this: + if (parents == null) { + // No matches + return null; + } + if (!(parents instanceof FixedBitSet)) { + throw new IllegalStateException("parentFilter must return FixedBitSet; got " + parents); + } + + return new ToChildBlockJoinScorer(this, parentScorer, (FixedBitSet) parents, doScores); + } + + @Override + public Explanation explain(AtomicReaderContext reader, int doc) throws IOException { + // TODO + throw new UnsupportedOperationException(getClass().getName() + + " cannot explain match on parent document"); + } + + @Override + public boolean scoresDocsOutOfOrder() { + return false; + } + } + + static class ToChildBlockJoinScorer extends Scorer { + private final Scorer parentScorer; + private final FixedBitSet parentBits; + private final boolean doScores; + private float parentScore; + + private int childDoc = -1; + private int parentDoc; + + public ToChildBlockJoinScorer(Weight weight, Scorer parentScorer, FixedBitSet parentBits, boolean doScores) { + super(weight); + this.doScores = doScores; + this.parentBits = parentBits; + this.parentScorer = parentScorer; + } + + @Override + public Collection getChildren() { + return Collections.singletonList(new ChildScorer(parentScorer, "BLOCK_JOIN")); + } + + @Override + public int nextDoc() throws IOException { + //System.out.println("Q.nextDoc() parentDoc=" + parentDoc + " childDoc=" + childDoc); + + if (childDoc+1 == parentDoc) { + // OK, we are done iterating through all children + // matching this one parent doc, so we now nextDoc() + // the parent. 
Use a while loop because we may have + // to skip over some number of parents w/ no + // children: + while (true) { + parentDoc = parentScorer.nextDoc(); + if (parentDoc == 0) { + // Degenerate but allowed: parent has no children + // TODO: would be nice to pull initial parent + // into ctor so we can skip this if... but it's + // tricky because scorer must return -1 for + // .doc() on init... + parentDoc = parentScorer.nextDoc(); + } + + if (parentDoc == NO_MORE_DOCS) { + childDoc = NO_MORE_DOCS; + //System.out.println(" END"); + return childDoc; + } + + childDoc = 1 + parentBits.prevSetBit(parentDoc-1); + if (childDoc < parentDoc) { + if (doScores) { + parentScore = parentScorer.score(); + } + //System.out.println(" " + childDoc); + return childDoc; + } else { + // Degenerate but allowed: parent has no children + } + } + } else { + assert childDoc < parentDoc: "childDoc=" + childDoc + " parentDoc=" + parentDoc; + childDoc++; + //System.out.println(" " + childDoc); + return childDoc; + } + } + + @Override + public int docID() { + return childDoc; + } + + @Override + public float score() throws IOException { + return parentScore; + } + + @Override + public int advance(int childTarget) throws IOException { + + //System.out.println("Q.advance childTarget=" + childTarget); + if (childTarget == NO_MORE_DOCS) { + //System.out.println(" END"); + return childDoc = parentDoc = NO_MORE_DOCS; + } + + assert childTarget != parentDoc; + if (childTarget > parentDoc) { + // Advance to new parent: + parentDoc = parentScorer.advance(childTarget); + //System.out.println(" advance to parentDoc=" + parentDoc); + assert parentDoc > childTarget; + if (parentDoc == NO_MORE_DOCS) { + //System.out.println(" END"); + return childDoc = NO_MORE_DOCS; + } + if (doScores) { + parentScore = parentScorer.score(); + } + final int firstChild = parentBits.prevSetBit(parentDoc-1); + //System.out.println(" firstChild=" + firstChild); + childTarget = Math.max(childTarget, firstChild); + } + + assert 
childTarget < parentDoc; + + // Advance within children of current parent: + childDoc = childTarget; + //System.out.println(" " + childDoc); + return childDoc; + } + } + + @Override + public void extractTerms(Set terms) { + parentQuery.extractTerms(terms); + } + + @Override + public Query rewrite(IndexReader reader) throws IOException { + final Query parentRewrite = parentQuery.rewrite(reader); + if (parentRewrite != parentQuery) { + Query rewritten = new ToChildBlockJoinQuery(parentQuery, + parentRewrite, + parentsFilter, + doScores); + rewritten.setBoost(getBoost()); + return rewritten; + } else { + return this; + } + } + + @Override + public String toString(String field) { + return "ToChildBlockJoinQuery ("+parentQuery.toString()+")"; + } + + @Override + public boolean equals(Object _other) { + if (_other instanceof ToChildBlockJoinQuery) { + final ToChildBlockJoinQuery other = (ToChildBlockJoinQuery) _other; + return origParentQuery.equals(other.origParentQuery) && + parentsFilter.equals(other.parentsFilter) && + doScores == other.doScores; + } else { + return false; + } + } + + @Override + public int hashCode() { + final int prime = 31; + int hash = 1; + hash = prime * hash + origParentQuery.hashCode(); + hash = prime * hash + new Boolean(doScores).hashCode(); + hash = prime * hash + parentsFilter.hashCode(); + return hash; + } + + @Override + public Object clone() { + return new ToChildBlockJoinQuery((Query) origParentQuery.clone(), + parentsFilter, + doScores); + } +} Property changes on: modules/join/src/java/org/apache/lucene/search/join/ToChildBlockJoinQuery.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native Index: modules/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java =================================================================== --- modules/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java (revision 0) +++ 
modules/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java (working copy) @@ -0,0 +1,433 @@ +package org.apache.lucene.search.join; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.Set; + +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; // javadocs +import org.apache.lucene.index.Term; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Scorer.ChildScorer; +import org.apache.lucene.search.Weight; +import org.apache.lucene.search.grouping.TopGroups; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.FixedBitSet; + +/** + * This query requires that you index + * children and parent docs as a single block, using the + * {@link 
IndexWriter#addDocuments} or {@link + * IndexWriter#updateDocuments} API. In each block, the + * child documents must appear first, ending with the parent + * document. At search time you provide a Filter + * identifying the parents, however this Filter must provide + * an {@link FixedBitSet} per sub-reader. + * + *

Once the block index is built, use this query to wrap + * any sub-query matching only child docs and join matches in that + * child document space up to the parent document space. + * You can then use this Query as a clause with + * other queries in the parent document space.

+ * + *

See {@link ToChildBlockJoinQuery} if you need to join + * in the reverse order. + * + *

The child documents must be orthogonal to the parent + * documents: the wrapped child query must never + * return a parent document.

+ * + * If you'd like to retrieve {@link TopGroups} for the + * resulting query, use the {@link ToParentBlockJoinCollector}. + * Note that this is not necessary, ie, if you simply want + * to collect the parent documents and don't need to see + * which child documents matched under that parent, then + * you can use any collector. + * + *

NOTE: If the overall query contains parent-only + * matches, for example you OR a parent-only query with a + * joined child-only query, then the resulting collected documents + * will be correct, however the {@link TopGroups} you get + * from {@link ToParentBlockJoinCollector} will not contain every + * child for parents that had matched. + * + *

See {@link org.apache.lucene.search.join} for an + * overview.

+ * + * @lucene.experimental + */ + +public class ToParentBlockJoinQuery extends Query { + + /** How to aggregate multiple child hit scores into a + * single parent score. */ + public static enum ScoreMode { + /** Do no scoring. */ + None, + /** Parent hit's score is the average of all child + scores. */ + Avg, + /** Parent hit's score is the max of all child + scores. */ + Max, + /** Parent hit's score is the sum of all child + scores. */ + Total}; + + private final Filter parentsFilter; + private final Query childQuery; + + // If we are rewritten, this is the original childQuery we + // were passed; we use this for .equals() and + // .hashCode(). This makes rewritten query equal the + // original, so that user does not have to .rewrite() their + // query before searching: + private final Query origChildQuery; + private final ScoreMode scoreMode; + + /** Create a ToParentBlockJoinQuery. + * + * @param childQuery Query matching child documents. + * @param parentsFilter Filter (must produce FixedBitSet + * per-seegment) identifying the parent documents. + * @param scoreMode How to aggregate multiple child scores + * into a single parent score. 
+ **/ + public ToParentBlockJoinQuery(Query childQuery, Filter parentsFilter, ScoreMode scoreMode) { + super(); + this.origChildQuery = childQuery; + this.childQuery = childQuery; + this.parentsFilter = parentsFilter; + this.scoreMode = scoreMode; + } + + private ToParentBlockJoinQuery(Query origChildQuery, Query childQuery, Filter parentsFilter, ScoreMode scoreMode) { + super(); + this.origChildQuery = origChildQuery; + this.childQuery = childQuery; + this.parentsFilter = parentsFilter; + this.scoreMode = scoreMode; + } + + @Override + public Weight createWeight(IndexSearcher searcher) throws IOException { + return new BlockJoinWeight(this, childQuery.createWeight(searcher), parentsFilter, scoreMode); + } + + private static class BlockJoinWeight extends Weight { + private final Query joinQuery; + private final Weight childWeight; + private final Filter parentsFilter; + private final ScoreMode scoreMode; + + public BlockJoinWeight(Query joinQuery, Weight childWeight, Filter parentsFilter, ScoreMode scoreMode) { + super(); + this.joinQuery = joinQuery; + this.childWeight = childWeight; + this.parentsFilter = parentsFilter; + this.scoreMode = scoreMode; + } + + @Override + public Query getQuery() { + return joinQuery; + } + + @Override + public float getValueForNormalization() throws IOException { + return childWeight.getValueForNormalization() * joinQuery.getBoost() * joinQuery.getBoost(); + } + + @Override + public void normalize(float norm, float topLevelBoost) { + childWeight.normalize(norm, topLevelBoost * joinQuery.getBoost()); + } + + @Override + public Scorer scorer(AtomicReaderContext readerContext, boolean scoreDocsInOrder, + boolean topScorer, Bits acceptDocs) throws IOException { + // Pass scoreDocsInOrder true, topScorer false to our sub: + final Scorer childScorer = childWeight.scorer(readerContext, true, false, acceptDocs); + + if (childScorer == null) { + // No matches + return null; + } + + final int firstChildDoc = childScorer.nextDoc(); + if 
(firstChildDoc == DocIdSetIterator.NO_MORE_DOCS) { + // No matches + return null; + } + + final DocIdSet parents = parentsFilter.getDocIdSet(readerContext, readerContext.reader.getLiveDocs()); + // TODO: once we do random-access filters we can + // generalize this: + if (parents == null) { + // No matches + return null; + } + if (!(parents instanceof FixedBitSet)) { + throw new IllegalStateException("parentFilter must return FixedBitSet; got " + parents); + } + + return new BlockJoinScorer(this, childScorer, (FixedBitSet) parents, firstChildDoc, scoreMode); + } + + @Override + public Explanation explain(AtomicReaderContext reader, int doc) throws IOException { + // TODO + throw new UnsupportedOperationException(getClass().getName() + + " cannot explain match on parent document"); + } + + @Override + public boolean scoresDocsOutOfOrder() { + return false; + } + } + + static class BlockJoinScorer extends Scorer { + private final Scorer childScorer; + private final FixedBitSet parentBits; + private final ScoreMode scoreMode; + private int parentDoc = -1; + private float parentScore; + private int nextChildDoc; + + private int[] pendingChildDocs = new int[5]; + private float[] pendingChildScores; + private int childDocUpto; + + public BlockJoinScorer(Weight weight, Scorer childScorer, FixedBitSet parentBits, int firstChildDoc, ScoreMode scoreMode) { + super(weight); + //System.out.println("Q.init firstChildDoc=" + firstChildDoc); + this.parentBits = parentBits; + this.childScorer = childScorer; + this.scoreMode = scoreMode; + if (scoreMode != ScoreMode.None) { + pendingChildScores = new float[5]; + } + nextChildDoc = firstChildDoc; + } + + @Override + public Collection getChildren() { + return Collections.singletonList(new ChildScorer(childScorer, "BLOCK_JOIN")); + } + + int getChildCount() { + return childDocUpto; + } + + int[] swapChildDocs(int[] other) { + final int[] ret = pendingChildDocs; + if (other == null) { + pendingChildDocs = new int[5]; + } else { + 
pendingChildDocs = other; + } + return ret; + } + + float[] swapChildScores(float[] other) { + if (scoreMode == ScoreMode.None) { + throw new IllegalStateException("ScoreMode is None"); + } + final float[] ret = pendingChildScores; + if (other == null) { + pendingChildScores = new float[5]; + } else { + pendingChildScores = other; + } + return ret; + } + + @Override + public int nextDoc() throws IOException { + //System.out.println("Q.nextDoc() nextChildDoc=" + nextChildDoc); + + if (nextChildDoc == NO_MORE_DOCS) { + //System.out.println(" end"); + return parentDoc = NO_MORE_DOCS; + } + + // Gather all children sharing the same parent as nextChildDoc + parentDoc = parentBits.nextSetBit(nextChildDoc); + //System.out.println(" parentDoc=" + parentDoc); + assert parentDoc != -1; + + float totalScore = 0; + float maxScore = Float.NEGATIVE_INFINITY; + + childDocUpto = 0; + do { + //System.out.println(" c=" + nextChildDoc); + if (pendingChildDocs.length == childDocUpto) { + pendingChildDocs = ArrayUtil.grow(pendingChildDocs); + } + if (scoreMode != ScoreMode.None && pendingChildScores.length == childDocUpto) { + pendingChildScores = ArrayUtil.grow(pendingChildScores); + } + pendingChildDocs[childDocUpto] = nextChildDoc; + if (scoreMode != ScoreMode.None) { + // TODO: specialize this into dedicated classes per-scoreMode + final float childScore = childScorer.score(); + pendingChildScores[childDocUpto] = childScore; + maxScore = Math.max(childScore, maxScore); + totalScore += childScore; + } + childDocUpto++; + nextChildDoc = childScorer.nextDoc(); + } while (nextChildDoc < parentDoc); + //System.out.println(" nextChildDoc=" + nextChildDoc); + + // Parent & child docs are supposed to be orthogonal: + assert nextChildDoc != parentDoc; + + switch(scoreMode) { + case Avg: + parentScore = totalScore / childDocUpto; + break; + case Max: + parentScore = maxScore; + break; + case Total: + parentScore = totalScore; + break; + case None: + break; + } + + //System.out.println(" 
return parentDoc=" + parentDoc); + return parentDoc; + } + + @Override + public int docID() { + return parentDoc; + } + + @Override + public float score() throws IOException { + return parentScore; + } + + @Override + public int advance(int parentTarget) throws IOException { + + //System.out.println("Q.advance parentTarget=" + parentTarget); + if (parentTarget == NO_MORE_DOCS) { + return parentDoc = NO_MORE_DOCS; + } + + if (parentTarget == 0) { + // Callers should only be passing in a docID from + // the parent space, so this means this parent + // has no children (it got docID 0), so it cannot + // possibly match. We must handle this case + // separately otherwise we pass invalid -1 to + // prevSetBit below: + return nextDoc(); + } + + final int prevParentDoc = parentBits.prevSetBit(parentTarget-1); + + //System.out.println(" rolled back to prevParentDoc=" + prevParentDoc + " vs parentDoc=" + parentDoc); + assert prevParentDoc >= parentDoc; + if (prevParentDoc > nextChildDoc) { + nextChildDoc = childScorer.advance(prevParentDoc); + // System.out.println(" childScorer advanced to child docID=" + nextChildDoc); + //} else { + //System.out.println(" skip childScorer advance"); + } + + // Parent & child docs are supposed to be orthogonal: + assert nextChildDoc != prevParentDoc; + + final int nd = nextDoc(); + //System.out.println(" return nextParentDoc=" + nd); + return nd; + } + } + + @Override + public void extractTerms(Set terms) { + childQuery.extractTerms(terms); + } + + @Override + public Query rewrite(IndexReader reader) throws IOException { + final Query childRewrite = childQuery.rewrite(reader); + if (childRewrite != childQuery) { + Query rewritten = new ToParentBlockJoinQuery(childQuery, + childRewrite, + parentsFilter, + scoreMode); + rewritten.setBoost(getBoost()); + return rewritten; + } else { + return this; + } + } + + @Override + public String toString(String field) { + return "ToParentBlockJoinQuery ("+childQuery.toString()+")"; + } + + @Override + 
public boolean equals(Object _other) { + if (_other instanceof ToParentBlockJoinQuery) { + final ToParentBlockJoinQuery other = (ToParentBlockJoinQuery) _other; + return origChildQuery.equals(other.origChildQuery) && + parentsFilter.equals(other.parentsFilter) && + scoreMode == other.scoreMode; + } else { + return false; + } + } + + @Override + public int hashCode() { + final int prime = 31; + int hash = 1; + hash = prime * hash + origChildQuery.hashCode(); + hash = prime * hash + scoreMode.hashCode(); + hash = prime * hash + parentsFilter.hashCode(); + return hash; + } + + @Override + public Object clone() { + return new ToParentBlockJoinQuery((Query) origChildQuery.clone(), + parentsFilter, + scoreMode); + } +} Property changes on: modules/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native Index: modules/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinCollector.java =================================================================== --- modules/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinCollector.java (revision 0) +++ modules/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinCollector.java (working copy) @@ -0,0 +1,459 @@ +package org.apache.lucene.search.join; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.Map; +import java.util.Queue; + +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; // javadocs +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.FieldComparator; +import org.apache.lucene.search.FieldValueHitQueue; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreCachingWrappingScorer; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Scorer.ChildScorer; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TopDocsCollector; +import org.apache.lucene.search.TopFieldCollector; +import org.apache.lucene.search.TopScoreDocCollector; +import org.apache.lucene.search.Weight; +import org.apache.lucene.search.grouping.GroupDocs; +import org.apache.lucene.search.grouping.TopGroups; +import org.apache.lucene.util.ArrayUtil; + + +/** Collects parent document hits for a Query containing one more more + * BlockJoinQuery clauses, sorted by the + * specified parent Sort. Note that this cannot perform + * arbitrary joins; rather, it requires that all joined + * documents are indexed as a doc block (using {@link + * IndexWriter#addDocuments} or {@link + * IndexWriter#updateDocuments}). Ie, the join is computed + * at index time. + * + *

The parent Sort must only use + * fields from the parent documents; sorting by field in + * the child documents is not supported.

+ * + *

You should only use this + * collector if one or more of the clauses in the query is + * a {@link ToParentBlockJoinQuery}. This collector will find those query + * clauses and record the matching child documents for the + * top scoring parent documents.

+ * + *

Multiple joins (star join) and nested joins and a mix + * of the two are allowed, as long as in all cases the + * documents corresponding to a single row of each joined + * parent table were indexed as a doc block.

+ * + *

For the simple star join you can retrieve the + * {@link TopGroups} instance containing each {@link ToParentBlockJoinQuery}'s + * matching child documents for the top parent groups, + * using {@link #getTopGroups}. Ie, + * a single query, which will contain two or more + * {@link ToParentBlockJoinQuery}'s as clauses representing the star join, + * can then retrieve two or more {@link TopGroups} instances.

+ * + *

For nested joins, the query will run correctly (ie, + * match the right parent and child documents), however, + * because TopGroups is currently unable to support nesting + * (each group is not able to hold another TopGroups), you + * are only able to retrieve the TopGroups of the first + * join. The TopGroups of the nested joins will not be + * correct. + * + * See {@link org.apache.lucene.search.join} for a code + * sample. + * + * @lucene.experimental + */ +public class ToParentBlockJoinCollector extends Collector { + + private final Sort sort; + + // Maps each BlockJoinQuery instance to its "slot" in + // joinScorers and in OneGroup's cached doc/scores/count: + private final Map joinQueryID = new HashMap(); + private final int numParentHits; + private final FieldValueHitQueue queue; + private final FieldComparator[] comparators; + private final int[] reverseMul; + private final int compEnd; + private final boolean trackMaxScore; + private final boolean trackScores; + + private int docBase; + private ToParentBlockJoinQuery.BlockJoinScorer[] joinScorers = new ToParentBlockJoinQuery.BlockJoinScorer[0]; + private IndexReader.AtomicReaderContext currentReaderContext; + private Scorer scorer; + private boolean queueFull; + + private OneGroup bottom; + private int totalHitCount; + private float maxScore = Float.NaN; + + /* Creates a ToParentBlockJoinCollector. The provided sort must + * not be null. 
*/ + public ToParentBlockJoinCollector(Sort sort, int numParentHits, boolean trackScores, boolean trackMaxScore) throws IOException { + // TODO: allow null sort to be specialized to relevance + // only collector + this.sort = sort; + this.trackMaxScore = trackMaxScore; + this.trackScores = trackScores; + this.numParentHits = numParentHits; + queue = FieldValueHitQueue.create(sort.getSort(), numParentHits); + comparators = queue.getComparators(); + reverseMul = queue.getReverseMul(); + compEnd = comparators.length - 1; + } + + private static final class OneGroup extends FieldValueHitQueue.Entry { + public OneGroup(int comparatorSlot, int parentDoc, float parentScore, int numJoins, boolean doScores) { + super(comparatorSlot, parentDoc, parentScore); + docs = new int[numJoins][]; + for(int joinID=0;joinID 0) { + // Definitely competitive. + break; + } else if (i == compEnd) { + // Here c=0. If we're at the last comparator, this doc is not + // competitive, since docs are visited in doc Id order, which means + // this doc cannot compete with any other document in the queue. + //System.out.println(" skip"); + return; + } + } + + //System.out.println(" competes! 
doc=" + (docBase + parentDoc)); + + // This hit is competitive - replace bottom element in queue & adjustTop + for (int i = 0; i < comparators.length; i++) { + comparators[i].copy(bottom.slot, parentDoc); + } + if (!trackMaxScore && trackScores) { + score = scorer.score(); + } + bottom.doc = docBase + parentDoc; + bottom.readerContext = currentReaderContext; + bottom.score = score; + copyGroups(bottom); + bottom = queue.updateTop(); + + for (int i = 0; i < comparators.length; i++) { + comparators[i].setBottom(bottom.slot); + } + } else { + // Startup transient: queue is not yet full: + final int comparatorSlot = totalHitCount - 1; + + // Copy hit into queue + for (int i = 0; i < comparators.length; i++) { + comparators[i].copy(comparatorSlot, parentDoc); + } + //System.out.println(" startup: new OG doc=" + (docBase+parentDoc)); + final OneGroup og = new OneGroup(comparatorSlot, docBase+parentDoc, score, joinScorers.length, trackScores); + og.readerContext = currentReaderContext; + copyGroups(og); + bottom = queue.add(og); + queueFull = totalHitCount == numParentHits; + if (queueFull) { + // End of startup transient: queue just filled up: + for (int i = 0; i < comparators.length; i++) { + comparators[i].setBottom(bottom.slot); + } + } + } + } + + // Pulls out child doc and scores for all join queries: + private void copyGroups(OneGroup og) { + // While rare, it's possible top arrays could be too + // short if join query had null scorer on first + // segment(s) but then became non-null on later segments + final int numSubScorers = joinScorers.length; + if (og.docs.length < numSubScorers) { + // While rare, this could happen if join query had + // null scorer on first segment(s) but then became + // non-null on later segments + og.docs = ArrayUtil.grow(og.docs); + } + if (og.counts.length < numSubScorers) { + og.counts = ArrayUtil.grow(og.counts); + } + if (trackScores && og.scores.length < numSubScorers) { + og.scores = ArrayUtil.grow(og.scores); + } + + 
//System.out.println("copyGroups parentDoc=" + og.doc); + for(int scorerIDX = 0;scorerIDX < numSubScorers;scorerIDX++) { + final ToParentBlockJoinQuery.BlockJoinScorer joinScorer = joinScorers[scorerIDX]; + //System.out.println(" scorer=" + joinScorer); + if (joinScorer != null) { + og.counts[scorerIDX] = joinScorer.getChildCount(); + //System.out.println(" count=" + og.counts[scorerIDX]); + og.docs[scorerIDX] = joinScorer.swapChildDocs(og.docs[scorerIDX]); + /* + for(int idx=0;idx queue = new LinkedList(); + queue.add(scorer); + while ((scorer = queue.poll()) != null) { + if (scorer instanceof ToParentBlockJoinQuery.BlockJoinScorer) { + enroll((ToParentBlockJoinQuery) scorer.getWeight().getQuery(), (ToParentBlockJoinQuery.BlockJoinScorer) scorer); + } + + for (ChildScorer sub : scorer.getChildren()) { + queue.add(sub.child); + } + } + } + + private final static class FakeScorer extends Scorer { + + float score; + int doc; + + public FakeScorer() { + super((Weight) null); + } + + @Override + public float score() { + return score; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int advance(int target) { + throw new UnsupportedOperationException(); + } + + @Override + public int nextDoc() { + throw new UnsupportedOperationException(); + } + } + + private OneGroup[] sortedGroups; + + private void sortQueue() { + sortedGroups = new OneGroup[queue.size()]; + for(int downTo=queue.size()-1;downTo>=0;downTo--) { + sortedGroups[downTo] = queue.pop(); + } + } + + /** Return the TopGroups for the specified + * BlockJoinQuery. The groupValue of each GroupDocs will + * be the parent docID for that group. Note that the + * {@link GroupDocs#totalHits}, which would be the + * total number of child documents matching that parent, + * is not computed (will always be 0). Returns null if + * no groups matched. 
*/ + @SuppressWarnings("unchecked") + public TopGroups getTopGroups(ToParentBlockJoinQuery query, Sort withinGroupSort, int offset, int maxDocsPerGroup, int withinGroupOffset, boolean fillSortFields) + + throws IOException { + + final Integer _slot = joinQueryID.get(query); + if (_slot == null) { + if (totalHitCount == 0) { + return null; + } else { + throw new IllegalArgumentException("the Query did not contain the provided BlockJoinQuery"); + } + } + + // unbox once + final int slot = _slot; + + if (sortedGroups == null) { + if (offset >= queue.size()) { + return null; + } + sortQueue(); + } else if (offset > sortedGroups.length) { + return null; + } + + int totalGroupedHitCount = 0; + + final FakeScorer fakeScorer = new FakeScorer(); + + final GroupDocs[] groups = new GroupDocs[sortedGroups.length - offset]; + + for(int groupIDX=offset;groupIDX(topDocs.getMaxScore(), + og.counts[slot], + topDocs.scoreDocs, + og.doc, + groupSortValues); + } + + return new TopGroups(new TopGroups(sort.getSort(), + withinGroupSort == null ? null : withinGroupSort.getSort(), + 0, totalGroupedHitCount, groups), + totalHitCount); + } +} Property changes on: modules/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinCollector.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native Index: modules/join/src/java/org/apache/lucene/search/join/package.html =================================================================== --- modules/join/src/java/org/apache/lucene/search/join/package.html (revision 1229659) +++ modules/join/src/java/org/apache/lucene/search/join/package.html (working copy) @@ -14,19 +14,25 @@ parent documents, as Lucene does not currently record any information about doc blocks.

-

At search time, use {@link org.apache.lucene.search.join.BlockJoinQuery} to remap - matches from any child {@link org.apache.lucene.search.Query} (ie, a query that matches only - child documents) up to the parent document space. The resulting - {@link org.apache.lucene.search.join.BlockJoinQuery} can then be used as a clause in any query that - matches parent documents.

+

At search time, use {@link + org.apache.lucene.search.join.ToParentBlockJoinQuery} to remap/join + matches from any child {@link org.apache.lucene.search.Query} (ie, a + query that matches only child documents) up to the parent document + space. The + resulting query can then be used as a clause in any query that + matches parent documents.

If you only care about the parent documents matching the query, you can use any collector to collect the parent hits, but if you'd also like to see which child documents match for each parent document, - use the {@link org.apache.lucene.search.join.BlockJoinCollector} to collect the hits. Once the + use the {@link org.apache.lucene.search.join.ToParentBlockJoinCollector} to collect the hits. Once the search is done, you retrieve a {@link org.apache.lucene.search.grouping.TopGroups} instance from the - {@link org.apache.lucene.search.join.BlockJoinCollector#getTopGroups} method.

+ {@link org.apache.lucene.search.join.ToParentBlockJoinCollector#getTopGroups} method.

+

To map/join in the opposite direction, use {@link + org.apache.lucene.search.join.ToChildBlockJoinQuery}. This wraps + any query matching parent documents, creating the joined query + matching only child documents. Index: lucene/contrib/CHANGES.txt =================================================================== --- lucene/contrib/CHANGES.txt (revision 1229659) +++ lucene/contrib/CHANGES.txt (working copy) @@ -57,7 +57,7 @@ * LUCENE-3527: Add LuceneLevenshteinDistance, which computes string distance in a compatible way as DirectSpellChecker. This can be used to merge top-N results from more than one SpellChecker. (James Dyer via Robert Muir) - + API Changes * LUCENE-2606: Changed RegexCapabilities interface to fix thread @@ -111,6 +111,11 @@ * LUCENE-3634: IndexReader's static main method was moved to a new tool, CompoundFileExtractor, in contrib/misc. (Mike McCandless) +* LUCENE-3685: Add ToChildBlockJoinQuery and rename previous + BlockJoinQuery to ToParentBlockJoinQuery, so that you can now do + joins in both parent to child and child to parent directions. + (Mike McCandless) + API Changes * LUCENE-3596: DirectoryTaxonomyWriter.openIndexWriter() now takes an