diff --git lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java
index ce9f0fc..fba5c0f 100644
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java
@@ -17,9 +17,7 @@ package org.apache.lucene.search.suggest;
* limitations under the License.
*/
import java.io.IOException;
-import java.util.Arrays;
import java.util.HashSet;
-import java.util.List;
import java.util.Set;
import org.apache.lucene.index.IndexReader;
@@ -32,14 +30,20 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
/**
+ *
* Dictionary with terms, weights and optionally payload information
* taken from stored fields in a Lucene index.
- *
- * NOTE:
+ *
+ * NOTE:
*
* -
- * The term, weight and (optionally) payload fields supplied
- * are required for ALL documents and has to be stored
+ * The term, weight and (optionally) payload fields have to be
+ * stored
+ *
+ * -
+ * if any of the term, weight or (optionally) payload fields supplied
+ * do not have a value for a document, then the document is
+ * rejected by the dictionary
*
*
*/
@@ -92,7 +96,6 @@ public class DocumentDictionary implements Dictionary {
private int currentDocId = -1;
private long currentWeight;
private BytesRef currentPayload;
- private StoredDocument doc;
/**
* Creates an iterator over term, weight and payload fields from the lucene
@@ -120,28 +123,35 @@ public class DocumentDictionary implements Dictionary {
continue;
}
- doc = reader.document(currentDocId, relevantFields);
+ StoredDocument doc = reader.document(currentDocId, relevantFields);
+
+ BytesRef tempPayload = null;
+ BytesRef tempTerm = null;
+ Long tempWeight = null;
if (hasPayloads) {
StorableField payload = doc.getField(payloadField);
- if (payload == null) {
- throw new IllegalArgumentException(payloadField + " does not exist");
- } else if (payload.binaryValue() == null) {
- throw new IllegalArgumentException(payloadField + " does not have binary value");
+ if (payload == null || (payload.binaryValue() == null && payload.stringValue() == null)) {
+ continue;
}
- currentPayload = payload.binaryValue();
+ tempPayload = (payload.binaryValue() != null) ? payload.binaryValue() : new BytesRef(payload.stringValue());
}
- currentWeight = getWeight(currentDocId);
+ tempWeight = getWeight(doc, currentDocId);
+ if (tempWeight == null) {
+ continue;
+ }
StorableField fieldVal = doc.getField(field);
- if (fieldVal == null) {
- throw new IllegalArgumentException(field + " does not exist");
- } else if(fieldVal.stringValue() == null) {
- throw new IllegalArgumentException(field + " does not have string value");
+ if (fieldVal == null || (fieldVal.binaryValue() == null && fieldVal.stringValue() == null)) {
+ continue;
}
+ tempTerm = (fieldVal.stringValue() != null) ? new BytesRef(fieldVal.stringValue()) : fieldVal.binaryValue();
- return new BytesRef(fieldVal.stringValue());
+ currentPayload = tempPayload;
+ currentWeight = tempWeight;
+
+ return tempTerm;
}
return null;
}
@@ -156,13 +166,15 @@ public class DocumentDictionary implements Dictionary {
return hasPayloads;
}
- /** Return the suggestion weight for this document */
- protected long getWeight(int docId) {
+ /**
+ * Return the suggestion weight for this document.
+ * if null is returned, it indicates the weightField
+ * does not exist for the doc
+ */
+ protected Long getWeight(StoredDocument doc, int docId) {
StorableField weight = doc.getField(weightField);
- if (weight == null) {
- throw new IllegalArgumentException(weightField + " does not exist");
- } else if (weight.numericValue() == null) {
- throw new IllegalArgumentException(weightField + " does not have numeric value");
+ if (weight == null || weight.numericValue() == null) {
+ return null;
}
return weight.numericValue().longValue();
}
diff --git lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentExpressionDictionary.java lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentExpressionDictionary.java
index ea494e1..16ab935 100644
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentExpressionDictionary.java
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentExpressionDictionary.java
@@ -30,6 +30,7 @@ import org.apache.lucene.expressions.js.JavascriptCompiler;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.ReaderUtil;
+import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.queries.function.FunctionValues;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.search.SortField;
@@ -37,17 +38,28 @@ import org.apache.lucene.util.BytesRefIterator;
/**
+ *
* Dictionary with terms and optionally payload information
* taken from stored fields in a Lucene index. Similar to
* {@link DocumentDictionary}, except it computes the weight
* of the terms in a document based on a user-defined expression
* having one or more {@link NumericDocValuesField} in the document.
- *
+ *
* NOTE:
*
* -
- * The term and (optionally) payload fields supplied
- * are required for ALL documents and has to be stored
+ * The term and (optionally) payload fields have to be
+ * stored
+ *
+ * -
+ * if the term or (optionally) payload fields supplied
+ * do not have a value for a document, then the document is
+ * rejected by the dictionary
+ *
+ * -
+ * All the fields used in
weightExpression should
+ * have values for all documents, if any of the fields do not
+ * have a value for a document, it will default to 0
*
*
*/
@@ -121,7 +133,7 @@ public class DocumentExpressionDictionary extends DocumentDictionary {
}
@Override
- protected long getWeight(int docId) {
+ protected Long getWeight(StoredDocument doc, int docId) {
int subIndex = ReaderUtil.subIndex(docId, starts);
if (subIndex != currentLeafIndex) {
currentLeafIndex = subIndex;
diff --git lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java
index 85418ff..c30ae48 100644
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java
@@ -1,8 +1,10 @@
package org.apache.lucene.search.suggest;
import java.io.IOException;
+import java.util.AbstractMap.SimpleEntry;
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
@@ -17,12 +19,14 @@ import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.StorableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.suggest.DocumentDictionary;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
import org.junit.Test;
/*
@@ -48,19 +52,52 @@ public class DocumentDictionaryTest extends LuceneTestCase {
static final String WEIGHT_FIELD_NAME = "w1";
static final String PAYLOAD_FIELD_NAME = "p1";
- private Map generateIndexDocuments(int ndocs) {
+ /** Returns Pair(list of invalid document terms, Map of document term -> document) */
+ private Map.Entry, Map> generateIndexDocuments(int ndocs, boolean requiresPayload) {
Map docs = new HashMap<>();
+ List invalidDocTerms = new ArrayList<>();
for(int i = 0; i < ndocs ; i++) {
- Field field = new TextField(FIELD_NAME, "field_" + i, Field.Store.YES);
- Field payload = new StoredField(PAYLOAD_FIELD_NAME, new BytesRef("payload_" + i));
- Field weight = new StoredField(WEIGHT_FIELD_NAME, 100d + i);
Document doc = new Document();
- doc.add(field);
- doc.add(payload);
- doc.add(weight);
- docs.put(field.stringValue(), doc);
+ boolean invalidDoc = false;
+ Field field = null;
+ // usually have valid term field in document
+ if (usually()) {
+ field = new TextField(FIELD_NAME, "field_" + i, Field.Store.YES);
+ doc.add(field);
+ } else {
+ invalidDoc = true;
+ }
+
+ // even if payload is not required usually have it
+ if (requiresPayload || usually()) {
+ // usually have valid payload field in document
+ if (usually()) {
+ Field payload = new StoredField(PAYLOAD_FIELD_NAME, new BytesRef("payload_" + i));
+ doc.add(payload);
+ } else if (requiresPayload) {
+ invalidDoc = true;
+ }
+ }
+
+ // usually have valid weight field in document
+ if (usually()) {
+ Field weight = new StoredField(WEIGHT_FIELD_NAME, 100d + i);
+ doc.add(weight);
+ } else {
+ invalidDoc = true;
+ }
+
+ String term = null;
+ if (invalidDoc) {
+ term = (field!=null) ? field.stringValue() : "invalid_" + i;
+ invalidDocTerms.add(term);
+ } else {
+ term = field.stringValue();
+ }
+
+ docs.put(term, doc);
}
- return docs;
+ return new SimpleEntry, Map>(invalidDocTerms, docs);
}
@Test
@@ -69,7 +106,9 @@ public class DocumentDictionaryTest extends LuceneTestCase {
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
- Map docs = generateIndexDocuments(10);
+ Map.Entry, Map> res = generateIndexDocuments(atLeast(100), true);
+ Map docs = res.getValue();
+ List invalidDocTerms = res.getKey();
for(Document doc: docs.values()) {
writer.addDocument(doc);
}
@@ -85,7 +124,12 @@ public class DocumentDictionaryTest extends LuceneTestCase {
assertEquals(tfp.weight(), doc.getField(WEIGHT_FIELD_NAME).numericValue().longValue());
assertTrue(tfp.payload().equals(doc.getField(PAYLOAD_FIELD_NAME).binaryValue()));
}
+
+ for (String invalidTerm : invalidDocTerms) {
+ assertNotNull(docs.remove(invalidTerm));
+ }
assertTrue(docs.isEmpty());
+
ir.close();
dir.close();
}
@@ -96,7 +140,9 @@ public class DocumentDictionaryTest extends LuceneTestCase {
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
- Map docs = generateIndexDocuments(10);
+ Map.Entry, Map> res = generateIndexDocuments(atLeast(100), false);
+ Map docs = res.getValue();
+ List invalidDocTerms = res.getKey();
for(Document doc: docs.values()) {
writer.addDocument(doc);
}
@@ -112,7 +158,12 @@ public class DocumentDictionaryTest extends LuceneTestCase {
assertEquals(tfp.weight(), doc.getField(WEIGHT_FIELD_NAME).numericValue().longValue());
assertEquals(tfp.payload(), null);
}
+
+ for (String invalidTerm : invalidDocTerms) {
+ assertNotNull(docs.remove(invalidTerm));
+ }
assertTrue(docs.isEmpty());
+
ir.close();
dir.close();
}
@@ -123,11 +174,14 @@ public class DocumentDictionaryTest extends LuceneTestCase {
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
- Map docs = generateIndexDocuments(10);
+ Map.Entry, Map> res = generateIndexDocuments(atLeast(100), false);
+ Map docs = res.getValue();
+ List invalidDocTerms = res.getKey();
Random rand = random();
List termsToDel = new ArrayList<>();
for(Document doc : docs.values()) {
- if(rand.nextBoolean()) {
+ StorableField f = doc.getField(FIELD_NAME);
+ if(rand.nextBoolean() && f != null && !invalidDocTerms.contains(f.stringValue())) {
termsToDel.add(doc.get(FIELD_NAME));
}
writer.addDocument(doc);
@@ -160,7 +214,12 @@ public class DocumentDictionaryTest extends LuceneTestCase {
assertEquals(tfp.weight(), doc.getField(WEIGHT_FIELD_NAME).numericValue().longValue());
assertEquals(tfp.payload(), null);
}
+
+ for (String invalidTerm : invalidDocTerms) {
+ assertNotNull(docs.remove(invalidTerm));
+ }
assertTrue(docs.isEmpty());
+
ir.close();
dir.close();
}