From 6c543861abe127fbdda1765a63262a2f8e40a368 Mon Sep 17 00:00:00 2001
From: Areek Zillur
Date: Tue, 1 Oct 2013 15:51:13 -0700
Subject: [PATCH] Added DocumentDictionary impl and tests

---
 .../lucene/search/suggest/DocumentDictionary.java  | 155 ++++++++++++++++++++
 .../search/suggest/DocumentDictionaryTest.java     | 159 +++++++++++++++++++++
 2 files changed, 314 insertions(+)
 create mode 100644 lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java
 create mode 100644 lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java

diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java
new file mode 100644
index 0000000..844017b
--- /dev/null
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java
@@ -0,0 +1,155 @@
+package org.apache.lucene.search.suggest;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.StorableField;
+import org.apache.lucene.index.StoredDocument;
+import org.apache.lucene.search.spell.Dictionary;
+import org.apache.lucene.search.spell.TermFreqPayloadIterator;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefIterator;
+
+/**
+ * Dictionary with terms, weights and optionally payload information
+ * taken from stored fields of a Lucene index.
+ *
+ * NOTE: The term, weight and (optionally) payload fields supplied
+ * are required for ALL documents and have to be stored. The weight field
+ * must hold a numeric value.
+ */
+public class DocumentDictionary implements Dictionary {
+
+  private final IndexReader reader;
+  private final String field;
+  private final String weightField;
+  private final String payloadField;
+
+  /**
+   * Creates a new dictionary with the contents of the fields named {@code field}
+   * for the terms and {@code weightField} for the weights that will be used for
+   * the corresponding terms.
+   */
+  public DocumentDictionary(IndexReader reader, String field, String weightField) {
+    this(reader, field, weightField, null);
+  }
+
+  /**
+   * Creates a new dictionary with the contents of the fields named {@code field}
+   * for the terms, {@code weightField} for the weights that will be used for
+   * the corresponding terms and {@code payloadField} for the corresponding payloads
+   * for the entry. A null {@code payloadField} yields entries without payloads.
+   */
+  public DocumentDictionary(IndexReader reader, String field, String weightField, String payloadField) {
+    this.reader = reader;
+    this.field = field;
+    this.weightField = weightField;
+    this.payloadField = payloadField;
+  }
+
+  @Override
+  public BytesRefIterator getWordsIterator() throws IOException {
+    return new TermWeightPayloadIterator(payloadField != null);
+  }
+
+  /**
+   * Walks the documents of the reader in docID order, skipping deleted ones, and
+   * exposes the term, weight and (optionally) payload stored in each document.
+   */
+  final class TermWeightPayloadIterator implements TermFreqPayloadIterator {
+    private final int lastDocId;
+    private final Set<String> relevantFields;
+    private final boolean withPayload;
+    private final Bits liveDocs;
+    private int currentDocId = -1;
+    private long currentWeight;
+    private BytesRef currentPayload;
+
+    /**
+     * Creates an iterator over term, weight and payload fields from the lucene
+     * index. Setting {@code withPayload} to false implies an iterator
+     * over only term and weight.
+     */
+    public TermWeightPayloadIterator(boolean withPayload) throws IOException {
+      this.lastDocId = reader.maxDoc() - 1;
+      this.withPayload = withPayload;
+      // May be null; next() then treats every document as live.
+      this.liveDocs = MultiFields.getLiveDocs(reader);
+      final List<String> relevantFieldList;
+      if (withPayload) {
+        relevantFieldList = Arrays.asList(field, weightField, payloadField);
+      } else {
+        relevantFieldList = Arrays.asList(field, weightField);
+      }
+      this.relevantFields = new HashSet<>(relevantFieldList);
+    }
+
+    @Override
+    public long weight() {
+      return currentWeight;
+    }
+
+    @Override
+    public BytesRef next() throws IOException {
+      while (currentDocId < lastDocId) {
+        currentDocId++;
+        if (liveDocs != null && !liveDocs.get(currentDocId)) {
+          continue; // skip deleted documents
+        }
+        StoredDocument doc = reader.document(currentDocId, relevantFields);
+        if (withPayload) {
+          currentPayload = toBytesRef(requireField(doc, payloadField));
+        }
+        Number weightVal = requireField(doc, weightField).numericValue();
+        if (weightVal == null) {
+          throw new IllegalArgumentException(weightField + " is not a numeric field");
+        }
+        currentWeight = weightVal.longValue();
+        return toBytesRef(requireField(doc, field));
+      }
+      return null;
+    }
+
+    @Override
+    public BytesRef payload() {
+      return currentPayload;
+    }
+
+    /** Returns the named stored field of the document, failing if the document lacks it. */
+    private StorableField requireField(StoredDocument doc, String name) {
+      StorableField stored = doc.getField(name);
+      if (stored == null) {
+        throw new IllegalArgumentException(name + " does not exist");
+      }
+      return stored;
+    }
+
+    /** Returns the binary value of the field if it has one, else its string value as bytes. */
+    private BytesRef toBytesRef(StorableField stored) {
+      BytesRef bytes = stored.binaryValue();
+      return (bytes != null) ? bytes : new BytesRef(stored.stringValue());
+    }
+  }
+}
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java
new file mode 100644
index 0000000..d6cc783
--- /dev/null
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java
@@ -0,0 +1,159 @@
+package org.apache.lucene.search.suggest;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.junit.Assert.*;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.DoubleField;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexDocument;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.spell.Dictionary;
+import org.apache.lucene.search.spell.HighFrequencyDictionary;
+import org.apache.lucene.search.spell.TermFreqPayloadIterator;
+import org.apache.lucene.search.suggest.DocumentDictionary;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefIterator;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
+
+/**
+ * Tests for {@link DocumentDictionary}.
+ */
+public class DocumentDictionaryTest extends LuceneTestCase {
+
+  private static final String FIELD_NAME = "f1";
+  private static final String WEIGHT_FIELD_NAME = "w1";
+  private static final String PAYLOAD_FIELD_NAME = "p1";
+
+  /** Builds {@code ndocs} documents, each with a stored term, payload and weight field. */
+  private List<Document> generateIndexDocuments(int ndocs) {
+    List<Document> docs = new ArrayList<>();
+    for (int i = 0; i < ndocs; i++) {
+      Document doc = new Document();
+      doc.add(new TextField(FIELD_NAME, "field_" + i, Field.Store.YES));
+      doc.add(new TextField(PAYLOAD_FIELD_NAME, "payload_" + i, Field.Store.YES));
+      doc.add(new DoubleField(WEIGHT_FIELD_NAME, 100d + i, Field.Store.YES));
+      docs.add(doc);
+    }
+    return docs;
+  }
+
+  /** Adds and commits the documents with a fresh writer, then closes the writer. */
+  private void indexDocuments(Directory dir, List<Document> docs) throws IOException {
+    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
+    for (Document doc : docs) {
+      writer.addDocument(doc);
+    }
+    writer.commit();
+    writer.close();
+  }
+
+  /** Maps each document's term value to the document, so checks do not depend on docID order. */
+  private Map<String, Document> byTerm(List<Document> docs) {
+    Map<String, Document> expected = new HashMap<>();
+    for (Document doc : docs) {
+      expected.put(doc.get(FIELD_NAME), doc);
+    }
+    return expected;
+  }
+
+  /**
+   * Drains the iterator and checks that it yields exactly the expected documents,
+   * each exactly once, with the expected weight and payload.
+   */
+  private void assertEntries(TermFreqPayloadIterator tfp, Map<String, Document> expected, boolean withPayload) throws IOException {
+    BytesRef term;
+    while ((term = tfp.next()) != null) {
+      Document doc = expected.remove(term.utf8ToString());
+      assertNotNull("unexpected term: " + term.utf8ToString(), doc);
+      assertEquals(doc.getField(WEIGHT_FIELD_NAME).numericValue().longValue(), tfp.weight());
+      if (withPayload) {
+        assertEquals(new BytesRef(doc.getField(PAYLOAD_FIELD_NAME).stringValue()), tfp.payload());
+      } else {
+        assertNull(tfp.payload());
+      }
+    }
+    assertTrue("missing terms: " + expected.keySet(), expected.isEmpty());
+  }
+
+  @Test
+  public void testBasic() throws IOException {
+    Directory dir = newDirectory();
+    List<Document> docs = generateIndexDocuments(10);
+    indexDocuments(dir, docs);
+    IndexReader ir = DirectoryReader.open(dir);
+    Dictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME, PAYLOAD_FIELD_NAME);
+    TermFreqPayloadIterator tfp = (TermFreqPayloadIterator) dictionary.getWordsIterator();
+    assertEntries(tfp, byTerm(docs), true);
+    ir.close();
+    dir.close();
+  }
+
+  @Test
+  public void testWithoutPayload() throws IOException {
+    Directory dir = newDirectory();
+    List<Document> docs = generateIndexDocuments(10);
+    indexDocuments(dir, docs);
+    IndexReader ir = DirectoryReader.open(dir);
+    Dictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME);
+    TermFreqPayloadIterator tfp = (TermFreqPayloadIterator) dictionary.getWordsIterator();
+    assertEntries(tfp, byTerm(docs), false);
+    ir.close();
+    dir.close();
+  }
+
+  @Test
+  public void testWithDeletions() throws IOException {
+    Directory dir = newDirectory();
+    List<Document> docs = generateIndexDocuments(10);
+    indexDocuments(dir, docs);
+    IndexWriter writer2 = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
+    writer2.deleteDocuments(new TermQuery(new Term(FIELD_NAME, docs.get(0).getField(FIELD_NAME).stringValue())));
+    writer2.commit();
+    writer2.close();
+    docs.remove(0); // the deleted document must no longer be returned
+    IndexReader ir = DirectoryReader.open(dir);
+    Dictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME);
+    TermFreqPayloadIterator tfp = (TermFreqPayloadIterator) dictionary.getWordsIterator();
+    assertEntries(tfp, byTerm(docs), false);
+    ir.close();
+    dir.close();
+  }
+}
-- 
1.8.3.2