diff --git lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java
index ce9f0fc..fba5c0f 100644
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java
@@ -17,9 +17,7 @@ package org.apache.lucene.search.suggest;
  * limitations under the License.
  */
 import java.io.IOException;
-import java.util.Arrays;
 import java.util.HashSet;
-import java.util.List;
 import java.util.Set;
 
 import org.apache.lucene.index.IndexReader;
@@ -32,14 +30,20 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefIterator;
 
 /**
+ * <p>
  * Dictionary with terms, weights and optionally payload information
  * taken from stored fields in a Lucene index.
- *
- * <b>NOTE:</b>
+ * </p>
+ * <b>NOTE:</b>
  *
  */
@@ -92,7 +96,6 @@ public class DocumentDictionary implements Dictionary {
     private int currentDocId = -1;
     private long currentWeight;
     private BytesRef currentPayload;
-    private StoredDocument doc;
 
     /**
      * Creates an iterator over term, weight and payload fields from the lucene
@@ -120,28 +123,35 @@ public class DocumentDictionary implements Dictionary {
           continue;
         }
 
-        doc = reader.document(currentDocId, relevantFields);
+        StoredDocument doc = reader.document(currentDocId, relevantFields);
+
+        BytesRef tempPayload = null;
+        BytesRef tempTerm = null;
+        Long tempWeight = null;
 
         if (hasPayloads) {
           StorableField payload = doc.getField(payloadField);
-          if (payload == null) {
-            throw new IllegalArgumentException(payloadField + " does not exist");
-          } else if (payload.binaryValue() == null) {
-            throw new IllegalArgumentException(payloadField + " does not have binary value");
+          if (payload == null || (payload.binaryValue() == null && payload.stringValue() == null)) {
+            continue;
           }
-          currentPayload = payload.binaryValue();
+          tempPayload = (payload.binaryValue() != null) ? payload.binaryValue() : new BytesRef(payload.stringValue());
         }
 
-        currentWeight = getWeight(currentDocId);
+        tempWeight = getWeight(doc, currentDocId);
+        if (tempWeight == null) {
+          continue;
+        }
 
         StorableField fieldVal = doc.getField(field);
-        if (fieldVal == null) {
-          throw new IllegalArgumentException(field + " does not exist");
-        } else if(fieldVal.stringValue() == null) {
-          throw new IllegalArgumentException(field + " does not have string value");
+        if (fieldVal == null || (fieldVal.binaryValue() == null && fieldVal.stringValue() == null)) {
+          continue;
        }
+        tempTerm = (fieldVal.stringValue() != null) ? new BytesRef(fieldVal.stringValue()) : fieldVal.binaryValue();
 
-        return new BytesRef(fieldVal.stringValue());
+        currentPayload = tempPayload;
+        currentWeight = tempWeight;
+
+        return tempTerm;
       }
       return null;
     }
@@ -156,13 +166,15 @@ public class DocumentDictionary implements Dictionary {
       return hasPayloads;
     }
 
-    /** Return the suggestion weight for this document */
-    protected long getWeight(int docId) {
+    /**
+     * Return the suggestion weight for this document.
+     * if null is returned, it indicates the weightField
+     * does not exist for the doc
+     */
+    protected Long getWeight(StoredDocument doc, int docId) {
       StorableField weight = doc.getField(weightField);
-      if (weight == null) {
-        throw new IllegalArgumentException(weightField + " does not exist");
-      } else if (weight.numericValue() == null) {
-        throw new IllegalArgumentException(weightField + " does not have numeric value");
+      if (weight == null || weight.numericValue() == null) {
+        return null;
       }
       return weight.numericValue().longValue();
     }
diff --git lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentExpressionDictionary.java lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentExpressionDictionary.java
index ea494e1..16ab935 100644
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentExpressionDictionary.java
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentExpressionDictionary.java
@@ -30,6 +30,7 @@ import org.apache.lucene.expressions.js.JavascriptCompiler;
 import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.ReaderUtil;
+import org.apache.lucene.index.StoredDocument;
 import org.apache.lucene.queries.function.FunctionValues;
 import org.apache.lucene.queries.function.ValueSource;
 import org.apache.lucene.search.SortField;
@@ -37,17 +38,28 @@ import org.apache.lucene.util.BytesRefIterator;
 
 /**
+ * <p>
  * Dictionary with terms and optionally payload information
  * taken from stored fields in a Lucene index. Similar to
  * {@link DocumentDictionary}, except it computes the weight
  * of the terms in a document based on a user-defined expression
  * having one or more {@link NumericDocValuesField} in the document.
- *
+ * </p>
  * <b>NOTE:</b>
  *
  */
@@ -121,7 +133,7 @@ public class DocumentExpressionDictionary extends DocumentDictionary {
     }
 
     @Override
-    protected long getWeight(int docId) {
+    protected Long getWeight(StoredDocument doc, int docId) {
       int subIndex = ReaderUtil.subIndex(docId, starts);
       if (subIndex != currentLeafIndex) {
        currentLeafIndex = subIndex;
diff --git lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java
index 85418ff..c30ae48 100644
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java
@@ -1,8 +1,10 @@
 package org.apache.lucene.search.suggest;
 
 import java.io.IOException;
+import java.util.AbstractMap.SimpleEntry;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Random;
@@ -17,12 +19,14 @@ import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.StorableField;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.spell.Dictionary;
 import org.apache.lucene.search.suggest.DocumentDictionary;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
 import org.junit.Test;
 
 /*
@@ -48,19 +52,52 @@ public class DocumentDictionaryTest extends LuceneTestCase {
   static final String WEIGHT_FIELD_NAME = "w1";
   static final String PAYLOAD_FIELD_NAME = "p1";
 
-  private Map<String, Document> generateIndexDocuments(int ndocs) {
+  /** Returns Pair(list of invalid document terms, Map of document term -> document) */
+  private Map.Entry<List<String>, Map<String, Document>> generateIndexDocuments(int ndocs, boolean requiresPayload) {
     Map<String, Document> docs = new HashMap<>();
+    List<String> invalidDocTerms = new ArrayList<>();
     for(int i = 0; i < ndocs ; i++) {
-      Field field = new TextField(FIELD_NAME, "field_" + i, Field.Store.YES);
-      Field payload = new StoredField(PAYLOAD_FIELD_NAME, new BytesRef("payload_" + i));
-      Field weight = new StoredField(WEIGHT_FIELD_NAME, 100d + i);
       Document doc = new Document();
-      doc.add(field);
-      doc.add(payload);
-      doc.add(weight);
-      docs.put(field.stringValue(), doc);
+      boolean invalidDoc = false;
+      Field field = null;
+      // usually have valid term field in document
+      if (usually()) {
+        field = new TextField(FIELD_NAME, "field_" + i, Field.Store.YES);
+        doc.add(field);
+      } else {
+        invalidDoc = true;
+      }
+
+      // even if payload is not required usually have it
+      if (requiresPayload || usually()) {
+        // usually have valid payload field in document
+        if (usually()) {
+          Field payload = new StoredField(PAYLOAD_FIELD_NAME, new BytesRef("payload_" + i));
+          doc.add(payload);
+        } else if (requiresPayload) {
+          invalidDoc = true;
+        }
+      }
+
+      // usually have valid weight field in document
+      if (usually()) {
+        Field weight = new StoredField(WEIGHT_FIELD_NAME, 100d + i);
+        doc.add(weight);
+      } else {
+        invalidDoc = true;
+      }
+
+      String term = null;
+      if (invalidDoc) {
+        term = (field!=null) ? field.stringValue() : "invalid_" + i;
+        invalidDocTerms.add(term);
+      } else {
+        term = field.stringValue();
+      }
+
+      docs.put(term, doc);
     }
-    return docs;
+    return new SimpleEntry<List<String>, Map<String, Document>>(invalidDocTerms, docs);
   }
 
   @Test
@@ -69,7 +106,9 @@ Directory dir = newDirectory();
     IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
     iwc.setMergePolicy(newLogMergePolicy());
     RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
-    Map<String, Document> docs = generateIndexDocuments(10);
+    Map.Entry<List<String>, Map<String, Document>> res = generateIndexDocuments(atLeast(100), true);
+    Map<String, Document> docs = res.getValue();
+    List<String> invalidDocTerms = res.getKey();
     for(Document doc: docs.values()) {
       writer.addDocument(doc);
     }
@@ -85,7 +124,12 @@
       assertEquals(tfp.weight(), doc.getField(WEIGHT_FIELD_NAME).numericValue().longValue());
       assertTrue(tfp.payload().equals(doc.getField(PAYLOAD_FIELD_NAME).binaryValue()));
     }
+
+    for (String invalidTerm : invalidDocTerms) {
+      assertNotNull(docs.remove(invalidTerm));
+    }
     assertTrue(docs.isEmpty());
+
     ir.close();
     dir.close();
   }
@@ -96,7 +140,9 @@ Directory dir = newDirectory();
     IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
     iwc.setMergePolicy(newLogMergePolicy());
     RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
-    Map<String, Document> docs = generateIndexDocuments(10);
+    Map.Entry<List<String>, Map<String, Document>> res = generateIndexDocuments(atLeast(100), false);
+    Map<String, Document> docs = res.getValue();
+    List<String> invalidDocTerms = res.getKey();
     for(Document doc: docs.values()) {
       writer.addDocument(doc);
     }
@@ -112,7 +158,12 @@
       assertEquals(tfp.weight(), doc.getField(WEIGHT_FIELD_NAME).numericValue().longValue());
       assertEquals(tfp.payload(), null);
     }
+
+    for (String invalidTerm : invalidDocTerms) {
+      assertNotNull(docs.remove(invalidTerm));
+    }
     assertTrue(docs.isEmpty());
+
     ir.close();
     dir.close();
   }
@@ -123,11 +174,14 @@ Directory dir = newDirectory();
     IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
     iwc.setMergePolicy(newLogMergePolicy());
     RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
-    Map<String, Document> docs = generateIndexDocuments(10);
+    Map.Entry<List<String>, Map<String, Document>> res = generateIndexDocuments(atLeast(100), false);
+    Map<String, Document> docs = res.getValue();
+    List<String> invalidDocTerms = res.getKey();
     Random rand = random();
     List<String> termsToDel = new ArrayList<>();
     for(Document doc : docs.values()) {
-      if(rand.nextBoolean()) {
+      StorableField f = doc.getField(FIELD_NAME);
+      if(rand.nextBoolean() && f != null && !invalidDocTerms.contains(f.stringValue())) {
         termsToDel.add(doc.get(FIELD_NAME));
       }
       writer.addDocument(doc);
     }
@@ -160,7 +214,12 @@
       assertEquals(tfp.weight(), doc.getField(WEIGHT_FIELD_NAME).numericValue().longValue());
       assertEquals(tfp.payload(), null);
     }
+
+    for (String invalidTerm : invalidDocTerms) {
+      assertNotNull(docs.remove(invalidTerm));
+    }
     assertTrue(docs.isEmpty());
+
     ir.close();
     dir.close();
   }
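
Note, a reviewer-style sketch (not part of the patch): the change above means a document that lacks the term field, the weight field, or a required payload field is now skipped by the dictionary's iterator instead of aborting iteration with an IllegalArgumentException. The illustration below is written as if it were one more test inside DocumentDictionaryTest, reusing that class's field-name constants and the LuceneTestCase helpers already visible in the diff; the test name is made up, and it stops at constructing the dictionary because the entry-iterator accessor is not part of this patch. It assumes the existing DocumentDictionary(IndexReader, String, String) constructor, which this patch does not touch.

  // Sketch only: assumes the surrounding DocumentDictionaryTest class and its imports.
  @Test
  public void testIncompleteDocumentIsSkippedSketch() throws IOException {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);

    Document complete = new Document();
    complete.add(new TextField(FIELD_NAME, "has_weight", Field.Store.YES));
    complete.add(new StoredField(WEIGHT_FIELD_NAME, 100d));
    writer.addDocument(complete);

    Document missingWeight = new Document();  // weight field deliberately omitted
    missingWeight.add(new TextField(FIELD_NAME, "no_weight", Field.Store.YES));
    writer.addDocument(missingWeight);

    IndexReader ir = writer.getReader();
    writer.close();

    // With this patch, the entry for "no_weight" is skipped when the dictionary's
    // entries are consumed; previously getWeight() threw IllegalArgumentException
    // as soon as the iterator reached that document.
    Dictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME);

    ir.close();
    dir.close();
  }

DocumentExpressionDictionary participates in the same contract through the overridden getWeight(StoredDocument, int) hook, whose return type is now the nullable Long.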