diff --git lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java
index ce9f0fc..dca6c45 100644
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java
@@ -17,13 +17,15 @@ package org.apache.lucene.search.suggest;
* limitations under the License.
*/
import java.io.IOException;
-import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
+import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.StorableField;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.search.spell.Dictionary;
@@ -32,14 +34,24 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
/**
+ *
* Dictionary with terms, weights and optionally payload information
- * taken from stored fields in a Lucene index.
- *
- * NOTE:
+ * taken from stored/indexed fields in a Lucene index.
+ *
+ * NOTE:
*
* -
- * The term, weight and (optionally) payload fields supplied
- * are required for ALL documents and has to be stored
+ * The term and (optionally) payload fields have to be
+ * stored
+ *
+ * -
+ * The weight field can be stored or can be a {@link NumericDocValues}.
+ * If the weight field is not defined, the value of the weight is 0
+ *
+ * -
+ * If any of the term or (optionally) payload fields supplied
+ * do not have a value for a document, then the document is
+ * rejected by the dictionary
*
*
*/
@@ -59,10 +71,7 @@ public class DocumentDictionary implements Dictionary {
* the corresponding terms.
*/
public DocumentDictionary(IndexReader reader, String field, String weightField) {
- this.reader = reader;
- this.field = field;
- this.weightField = weightField;
- this.payloadField = null;
+ this(reader, field, weightField, null);
}
/**
@@ -85,6 +94,13 @@ public class DocumentDictionary implements Dictionary {
/** Implements {@link InputIterator} from stored fields. */
protected class DocumentInputIterator implements InputIterator {
+ /** leaves of the reader */
+ protected final List<AtomicReaderContext> leaves;
+ /** starting docIds of all the leaves */
+ protected final int[] starts;
+ /** current leaf index */
+ protected int currentLeafIndex = 0;
+
private final int docCount;
private final Set<String> relevantFields;
private final boolean hasPayloads;
@@ -92,7 +108,8 @@ public class DocumentDictionary implements Dictionary {
private int currentDocId = -1;
private long currentWeight;
private BytesRef currentPayload;
- private StoredDocument doc;
+ private NumericDocValues weightValues;
+
/**
* Creates an iterator over term, weight and payload fields from the lucene
@@ -102,9 +119,18 @@ public class DocumentDictionary implements Dictionary {
public DocumentInputIterator(boolean hasPayloads) throws IOException {
docCount = reader.maxDoc() - 1;
this.hasPayloads = hasPayloads;
+ this.leaves = reader.leaves();
+ if (leaves.size() == 0) {
+ throw new IllegalArgumentException("Reader has to have at least one leaf");
+ }
currentPayload = null;
liveDocs = MultiFields.getLiveDocs(reader);
this.relevantFields = getRelevantFields(new String [] {field, weightField, payloadField});
+ starts = new int[leaves.size() + 1];
+ for (int i = 0; i < leaves.size(); i++) {
+ starts[i] = leaves.get(i).docBase;
+ }
+ starts[leaves.size()] = reader.maxDoc();
}
@Override
@@ -120,28 +146,29 @@ public class DocumentDictionary implements Dictionary {
continue;
}
- doc = reader.document(currentDocId, relevantFields);
+ StoredDocument doc = reader.document(currentDocId, relevantFields);
+
+ BytesRef tempPayload = null;
+ BytesRef tempTerm = null;
if (hasPayloads) {
StorableField payload = doc.getField(payloadField);
- if (payload == null) {
- throw new IllegalArgumentException(payloadField + " does not exist");
- } else if (payload.binaryValue() == null) {
- throw new IllegalArgumentException(payloadField + " does not have binary value");
+ if (payload == null || (payload.binaryValue() == null && payload.stringValue() == null)) {
+ continue;
}
- currentPayload = payload.binaryValue();
+ tempPayload = (payload.binaryValue() != null) ? payload.binaryValue() : new BytesRef(payload.stringValue());
}
- currentWeight = getWeight(currentDocId);
-
StorableField fieldVal = doc.getField(field);
- if (fieldVal == null) {
- throw new IllegalArgumentException(field + " does not exist");
- } else if(fieldVal.stringValue() == null) {
- throw new IllegalArgumentException(field + " does not have string value");
+ if (fieldVal == null || (fieldVal.binaryValue() == null && fieldVal.stringValue() == null)) {
+ continue;
}
+ tempTerm = (fieldVal.stringValue() != null) ? new BytesRef(fieldVal.stringValue()) : fieldVal.binaryValue();
- return new BytesRef(fieldVal.stringValue());
+ currentPayload = tempPayload;
+ currentWeight = getWeight(doc, currentDocId);
+
+ return tempTerm;
}
return null;
}
@@ -156,15 +183,29 @@ public class DocumentDictionary implements Dictionary {
return hasPayloads;
}
- /** Return the suggestion weight for this document */
- protected long getWeight(int docId) {
+ /**
+ * Returns the value of the weightField for the current document.
+ * Retrieves the value for the weightField if it is stored (using doc)
+ * or if it is indexed as {@link NumericDocValues} (using docId) for the document.
+ * If no value is found, then the weight is 0.
+ */
+ protected long getWeight(StoredDocument doc, int docId) {
StorableField weight = doc.getField(weightField);
- if (weight == null) {
- throw new IllegalArgumentException(weightField + " does not exist");
- } else if (weight.numericValue() == null) {
- throw new IllegalArgumentException(weightField + " does not have numeric value");
+ if (weight != null) {
+ return (weight.numericValue() != null) ? weight.numericValue().longValue() : 0;
+ } else {
+ int subIndex = ReaderUtil.subIndex(docId, starts);
+ if (subIndex != currentLeafIndex || weightValues == null) {
+ currentLeafIndex = subIndex;
+ try {
+ AtomicReaderContext ctx = leaves.get(currentLeafIndex);
+ weightValues = ctx.reader().getNumericDocValues(weightField);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ return (weightValues != null) ? weightValues.get(docId - starts[subIndex]) : 0;
}
- return weight.numericValue().longValue();
}
private Set<String> getRelevantFields(String... fields) {
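
For context, a minimal sketch of how the patched DocumentDictionary is intended to be consumed. The field names ("suggest", "weight", "payload"), the RAMDirectory setup and the cast of getWordsIterator() to InputIterator are illustrative assumptions, not part of this patch; the point is that the weight may now live in NumericDocValues and that a document missing the term field is skipped rather than triggering an exception.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.suggest.DocumentDictionary;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;

public class DocumentDictionaryExample {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir,
        new IndexWriterConfig(Version.LUCENE_CURRENT, new StandardAnalyzer(Version.LUCENE_CURRENT)));

    // valid document: stored term, NumericDocValues weight, stored payload
    Document doc = new Document();
    doc.add(new TextField("suggest", "lucene in action", Field.Store.YES));
    doc.add(new NumericDocValuesField("weight", 42L));
    doc.add(new StoredField("payload", new BytesRef("book-1")));
    writer.addDocument(doc);

    // document without the term field: with this patch it is skipped,
    // instead of raising an IllegalArgumentException
    Document incomplete = new Document();
    incomplete.add(new NumericDocValuesField("weight", 7L));
    writer.addDocument(incomplete);
    writer.close();

    IndexReader reader = DirectoryReader.open(dir);
    DocumentDictionary dict = new DocumentDictionary(reader, "suggest", "weight", "payload");
    InputIterator it = (InputIterator) dict.getWordsIterator();
    BytesRef term;
    while ((term = it.next()) != null) {
      System.out.println(term.utf8ToString() + " weight=" + it.weight() + " payload=" + it.payload());
    }
    reader.close();
    dir.close();
  }
}
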
diff --git lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentExpressionDictionary.java lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentExpressionDictionary.java
index ea494e1..3f38315 100644
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentExpressionDictionary.java
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentExpressionDictionary.java
@@ -20,16 +20,15 @@ package org.apache.lucene.search.suggest;
import java.io.IOException;
import java.text.ParseException;
import java.util.HashMap;
-import java.util.List;
import java.util.Set;
import org.apache.lucene.document.NumericDocValuesField; // javadocs
import org.apache.lucene.expressions.Expression;
import org.apache.lucene.expressions.SimpleBindings;
import org.apache.lucene.expressions.js.JavascriptCompiler;
-import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.ReaderUtil;
+import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.queries.function.FunctionValues;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.search.SortField;
@@ -37,23 +36,34 @@ import org.apache.lucene.util.BytesRefIterator;
/**
+ *
* Dictionary with terms and optionally payload information
* taken from stored fields in a Lucene index. Similar to
* {@link DocumentDictionary}, except it computes the weight
* of the terms in a document based on a user-defined expression
* having one or more {@link NumericDocValuesField} in the document.
- *
+ *
* NOTE:
*
* -
- * The term and (optionally) payload fields supplied
- * are required for ALL documents and has to be stored
+ * The term and (optionally) payload fields have to be
+ * stored
+ *
+ * -
+ * If the term or (optionally) payload fields supplied
+ * do not have a value for a document, then the document is
+ * rejected by the dictionary
+ *
+ * -
+ * All the fields used in weightExpression should
+ * have values for all documents; if any of the fields do not
+ * have a value for a document, it will default to 0
*
*
*/
public class DocumentExpressionDictionary extends DocumentDictionary {
- private ValueSource weightsValueSource;
+ private final ValueSource weightsValueSource;
/**
* Creates a new dictionary with the contents of the fields named field
@@ -86,8 +96,31 @@ public class DocumentExpressionDictionary extends DocumentDictionary {
for (SortField sortField: sortFields) {
bindings.add(sortField);
}
- weightsValueSource = expression.getValueSource(bindings);
+ weightsValueSource = expression.getValueSource(bindings);
+ }
+
+ /**
+ * Creates a new dictionary with the contents of the fields named field
+ * for the terms, payloadField for the corresponding payloads
+ * and uses the weightsValueSource supplied to determine the
+ * score.
+ */
+ public DocumentExpressionDictionary(IndexReader reader, String field,
+ ValueSource weightsValueSource, String payload) {
+ super(reader, field, null, payload);
+ this.weightsValueSource = weightsValueSource;
+ }
+
+ /**
+ * Creates a new dictionary with the contents of the fields named field
+ * for the terms and uses the weightsValueSource supplied to determine the
+ * score.
+ */
+ public DocumentExpressionDictionary(IndexReader reader, String field,
+ ValueSource weightsValueSource) {
+ super(reader, field, null, null);
+ this.weightsValueSource = weightsValueSource;
}
@Override
@@ -98,30 +131,19 @@ public class DocumentExpressionDictionary extends DocumentDictionary {
final class DocumentExpressionInputIterator extends DocumentDictionary.DocumentInputIterator {
private FunctionValues currentWeightValues;
- private int currentLeafIndex = 0;
- private final List<AtomicReaderContext> leaves;
-
- private final int[] starts;
public DocumentExpressionInputIterator(boolean hasPayloads)
throws IOException {
super(hasPayloads);
- leaves = reader.leaves();
- if (leaves.size() == 0) {
- throw new IllegalArgumentException("Reader has to have at least one leaf");
- }
- starts = new int[leaves.size() + 1];
- for (int i = 0; i < leaves.size(); i++) {
- starts[i] = leaves.get(i).docBase;
- }
- starts[leaves.size()] = reader.maxDoc();
-
- currentLeafIndex = 0;
currentWeightValues = weightsValueSource.getValues(new HashMap<String, Object>(), leaves.get(currentLeafIndex));
}
+ /**
+ * Returns the weight for the current docId as computed
+ * by the weightsValueSource
+ */
@Override
- protected long getWeight(int docId) {
+ protected long getWeight(StoredDocument doc, int docId) {
int subIndex = ReaderUtil.subIndex(docId, starts);
if (subIndex != currentLeafIndex) {
currentLeafIndex = subIndex;
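
For the expression flavor, the new ValueSource-based constructor added above can be wired to a compiled JavaScript expression over NumericDocValuesFields. A rough sketch; the expression, the bindings and the field names ("clicks", "recency", "suggest", "payload") are made-up examples, not anything defined by this patch.

import org.apache.lucene.expressions.Expression;
import org.apache.lucene.expressions.SimpleBindings;
import org.apache.lucene.expressions.js.JavascriptCompiler;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.suggest.DocumentExpressionDictionary;
import org.apache.lucene.store.Directory;

public class ExpressionDictionaryExample {
  public static DocumentExpressionDictionary build(Directory dir) throws Exception {
    IndexReader reader = DirectoryReader.open(dir);

    // weight = 2*clicks + recency; both fields are assumed to be NumericDocValuesFields
    Expression expr = JavascriptCompiler.compile("2*clicks + recency");
    SimpleBindings bindings = new SimpleBindings();
    bindings.add(new SortField("clicks", SortField.Type.LONG));
    bindings.add(new SortField("recency", SortField.Type.LONG));
    ValueSource weights = expr.getValueSource(bindings);

    // constructor added by this patch: the weight comes from the ValueSource,
    // "suggest" and "payload" are illustrative field names
    return new DocumentExpressionDictionary(reader, "suggest", weights, "payload");
  }
}
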
diff --git lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java
index 85418ff..60e0ad8 100644
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java
@@ -1,22 +1,24 @@
package org.apache.lucene.search.suggest;
import java.io.IOException;
+import java.util.AbstractMap.SimpleEntry;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
-import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.StorableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.suggest.DocumentDictionary;
@@ -48,19 +50,52 @@ public class DocumentDictionaryTest extends LuceneTestCase {
static final String WEIGHT_FIELD_NAME = "w1";
static final String PAYLOAD_FIELD_NAME = "p1";
- private Map<String, Document> generateIndexDocuments(int ndocs) {
+ /** Returns Pair(list of invalid document terms, Map of document term -> document) */
+ private Map.Entry<List<String>, Map<String, Document>> generateIndexDocuments(int ndocs, boolean requiresPayload) {
Map<String, Document> docs = new HashMap<>();
+ List<String> invalidDocTerms = new ArrayList<>();
for(int i = 0; i < ndocs ; i++) {
- Field field = new TextField(FIELD_NAME, "field_" + i, Field.Store.YES);
- Field payload = new StoredField(PAYLOAD_FIELD_NAME, new BytesRef("payload_" + i));
- Field weight = new StoredField(WEIGHT_FIELD_NAME, 100d + i);
Document doc = new Document();
- doc.add(field);
- doc.add(payload);
- doc.add(weight);
- docs.put(field.stringValue(), doc);
+ boolean invalidDoc = false;
+ Field field = null;
+ // usually have valid term field in document
+ if (usually()) {
+ field = new TextField(FIELD_NAME, "field_" + i, Field.Store.YES);
+ doc.add(field);
+ } else {
+ invalidDoc = true;
+ }
+
+ // even when the payload is not required, the document usually has one
+ if (requiresPayload || usually()) {
+ // usually have valid payload field in document
+ if (usually()) {
+ Field payload = new StoredField(PAYLOAD_FIELD_NAME, new BytesRef("payload_" + i));
+ doc.add(payload);
+ } else if (requiresPayload) {
+ invalidDoc = true;
+ }
+ }
+
+ // usually have valid weight field in document
+ if (usually()) {
+ Field weight = (rarely()) ?
+ new StoredField(WEIGHT_FIELD_NAME, 100d + i) :
+ new NumericDocValuesField(WEIGHT_FIELD_NAME, 100 + i);
+ doc.add(weight);
+ }
+
+ String term = null;
+ if (invalidDoc) {
+ term = (field!=null) ? field.stringValue() : "invalid_" + i;
+ invalidDocTerms.add(term);
+ } else {
+ term = field.stringValue();
+ }
+
+ docs.put(term, doc);
}
- return docs;
+ return new SimpleEntry<List<String>, Map<String, Document>>(invalidDocTerms, docs);
}
@Test
@@ -69,7 +104,9 @@ public class DocumentDictionaryTest extends LuceneTestCase {
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
- Map<String, Document> docs = generateIndexDocuments(10);
+ Map.Entry<List<String>, Map<String, Document>> res = generateIndexDocuments(atLeast(10000), true);
+ Map<String, Document> docs = res.getValue();
+ List<String> invalidDocTerms = res.getKey();
for(Document doc: docs.values()) {
writer.addDocument(doc);
}
@@ -82,10 +119,16 @@ public class DocumentDictionaryTest extends LuceneTestCase {
while((f = tfp.next())!=null) {
Document doc = docs.remove(f.utf8ToString());
assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME))));
- assertEquals(tfp.weight(), doc.getField(WEIGHT_FIELD_NAME).numericValue().longValue());
+ Field weightField = doc.getField(WEIGHT_FIELD_NAME);
+ assertEquals(tfp.weight(), (weightField != null) ? weightField.numericValue().longValue() : 0);
assertTrue(tfp.payload().equals(doc.getField(PAYLOAD_FIELD_NAME).binaryValue()));
}
+
+ for (String invalidTerm : invalidDocTerms) {
+ assertNotNull(docs.remove(invalidTerm));
+ }
assertTrue(docs.isEmpty());
+
ir.close();
dir.close();
}
@@ -96,7 +139,9 @@ public class DocumentDictionaryTest extends LuceneTestCase {
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
- Map<String, Document> docs = generateIndexDocuments(10);
+ Map.Entry<List<String>, Map<String, Document>> res = generateIndexDocuments(atLeast(10000), false);
+ Map<String, Document> docs = res.getValue();
+ List<String> invalidDocTerms = res.getKey();
for(Document doc: docs.values()) {
writer.addDocument(doc);
}
@@ -109,10 +154,17 @@ public class DocumentDictionaryTest extends LuceneTestCase {
while((f = tfp.next())!=null) {
Document doc = docs.remove(f.utf8ToString());
assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME))));
- assertEquals(tfp.weight(), doc.getField(WEIGHT_FIELD_NAME).numericValue().longValue());
+ Field weightField = doc.getField(WEIGHT_FIELD_NAME);
+ assertEquals(tfp.weight(), (weightField != null) ? weightField.numericValue().longValue() : 0);
assertEquals(tfp.payload(), null);
}
+
+ for (String invalidTerm : invalidDocTerms) {
+ assertNotNull(docs.remove(invalidTerm));
+ }
+
assertTrue(docs.isEmpty());
+
ir.close();
dir.close();
}
@@ -123,11 +175,14 @@ public class DocumentDictionaryTest extends LuceneTestCase {
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
- Map<String, Document> docs = generateIndexDocuments(10);
+ Map.Entry<List<String>, Map<String, Document>> res = generateIndexDocuments(atLeast(10000), false);
+ Map<String, Document> docs = res.getValue();
+ List<String> invalidDocTerms = res.getKey();
Random rand = random();
List<String> termsToDel = new ArrayList<>();
for(Document doc : docs.values()) {
- if(rand.nextBoolean()) {
+ StorableField f = doc.getField(FIELD_NAME);
+ if(rand.nextBoolean() && f != null && !invalidDocTerms.contains(f.stringValue())) {
termsToDel.add(doc.get(FIELD_NAME));
}
writer.addDocument(doc);
@@ -157,10 +212,16 @@ public class DocumentDictionaryTest extends LuceneTestCase {
while((f = tfp.next())!=null) {
Document doc = docs.remove(f.utf8ToString());
assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME))));
- assertEquals(tfp.weight(), doc.getField(WEIGHT_FIELD_NAME).numericValue().longValue());
+ Field weightField = doc.getField(WEIGHT_FIELD_NAME);
+ assertEquals(tfp.weight(), (weightField != null) ? weightField.numericValue().longValue() : 0);
assertEquals(tfp.payload(), null);
}
+
+ for (String invalidTerm : invalidDocTerms) {
+ assertNotNull(docs.remove(invalidTerm));
+ }
assertTrue(docs.isEmpty());
+
ir.close();
dir.close();
}