Index: lucene/contrib/queries/src/test/org/apache/lucene/search/similar/TestMoreLikeThisUsingTags.java =================================================================== --- lucene/contrib/queries/src/test/org/apache/lucene/search/similar/TestMoreLikeThisUsingTags.java (revision 0) +++ lucene/contrib/queries/src/test/org/apache/lucene/search/similar/TestMoreLikeThisUsingTags.java (revision 0) @@ -0,0 +1,118 @@ +package org.apache.lucene.search.similar; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.StringReader; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.TermFreqVector; +import org.apache.lucene.index.IndexWriter.MaxFieldLength; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.similar.MoreLikeThis; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.Version; + +public class TestMoreLikeThisUsingTags extends LuceneTestCase { + private RAMDirectory directory; + private IndexReader reader; + private IndexSearcher searcher; + + protected void setUp() throws Exception { + directory = new RAMDirectory(); + TagIndexWriter writer = new TagIndexWriter(directory, new MockAnalyzer(), + MaxFieldLength.UNLIMITED); + + // Add series of docs with specific information for MoreLikeThis + addDoc(writer, "corn grain", "corn"); + addDoc(writer, "corn maize grain wheat tonnes agriculture", "corn maize"); + addDoc(writer, "corn usda agriculture", "corn"); + addDoc(writer, "usda sorghum department soybean", "sorghum"); + addDoc(writer, "soybean usda wheat grain tonnes bushels crop", "soybean"); + addDoc(writer, "wheat grain tonnes usda department crop export maize", + "maize"); + + writer.setTagFieldNames(new String[] {"tags"}); + writer.setDocTextFieldNames(new String[] {"text"}); + + writer.close(); + reader = IndexReader.open(directory, true); + 
searcher = new IndexSearcher(reader); + + } + + protected void tearDown() throws Exception { + reader.close(); + searcher.close(); + directory.close(); + } + + private void addDoc(IndexWriter writer, String text, String tags) + throws IOException { + Document doc = new Document(); + Field textField = new Field("text", text, Field.Store.NO, + Field.Index.ANALYZED, Field.TermVector.YES); + Field tagsField = new Field("tags", tags, Field.Store.YES, + Field.Index.ANALYZED, Field.TermVector.YES); + doc.add(textField); + doc.add(tagsField); + writer.addDocument(doc); + } + + public void testMoreLikeThisUsingTags() throws Throwable { + + MoreLikeThis mlt = new MoreLikeThis(reader); + int id = 0; + mlt.setFieldNames(new String[] {"text"}); + // mlt.setFieldNames(null); + mlt.setMinWordLen(2); + mlt.setMinTermFreq(1); + mlt.setMinDocFreq(1); + + MoreLikeThisUsingTags mltags = new MoreLikeThisUsingTags(reader, + new String[] {"tags"}); + mltags.setBoost(true); + + HashSet s1 = new HashSet(); + HashSet s2 = new HashSet(); + Query q1 = mlt.like(id); + Query q2 = mltags.like(id); + q1.extractTerms(s1); + q2.extractTerms(s2); + assertTrue( + "MoreLikeThisUsingTags query has more terms than MoreLikeThis query ", + s1.size() < s2.size()); + + } + +} Index: lucene/contrib/queries/src/java/org/apache/lucene/search/similar/TagIndexWriter.java =================================================================== --- lucene/contrib/queries/src/java/org/apache/lucene/search/similar/TagIndexWriter.java (revision 0) +++ lucene/contrib/queries/src/java/org/apache/lucene/search/similar/TagIndexWriter.java (revision 0) @@ -0,0 +1,344 @@ +package org.apache.lucene.search.similar; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.IndexDeletionPolicy; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermVectorMapper; +import org.apache.lucene.index.TermVectorOffsetInfo; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.LockObtainFailedException; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; + +public class TagIndexWriter extends IndexWriter { + + /** + * Default number of document terms that map to a tag term + */ + public static final int DEFAULT_MAX_DOC_TERMS_PER_TAG_TERM = 100; + + /** + * Default set of stopwords. If null means to allow stop words. 
+ * + * @see #setStopWords + * @see #getStopWords + */ + public static final Set DEFAULT_STOP_WORDS = null; + + /** + * Current set of stop words. + */ + private Set stopWords = DEFAULT_STOP_WORDS; + + /** + * Ignore words less than this length or if 0 then this has no effect. + * + * @see #getMinWordLen + * @see #setMinWordLen + */ + public static final int DEFAULT_MIN_WORD_LENGTH = 0; + + /** + * Ignore words greater than this length or if 0 then this has no effect. + * + * @see #getMaxWordLen + * @see #setMaxWordLen + */ + public static final int DEFAULT_MAX_WORD_LENGTH = 0; + + /** + * Ignore words if less than this length + */ + private int minWordLen = DEFAULT_MIN_WORD_LENGTH; + + /** + * Ignore words if greater than this length + */ + private int maxWordLen = DEFAULT_MAX_WORD_LENGTH; + + /** + * Maximum number of doc terms that map to a tag term + */ + protected int maxDocTermsPerTagTerm = DEFAULT_MAX_DOC_TERMS_PER_TAG_TERM; + + public TagIndexWriter(Directory d, Analyzer a, boolean create, + IndexDeletionPolicy deletionPolicy, IndexWriter.MaxFieldLength mfl) + throws CorruptIndexException, LockObtainFailedException, IOException { + super(d, a, create, deletionPolicy, mfl); + } + + public TagIndexWriter(Directory d, Analyzer a, boolean create, + IndexWriter.MaxFieldLength mfl) throws CorruptIndexException, + LockObtainFailedException, IOException { + super(d, a, create, mfl); + } + + public TagIndexWriter(Directory d, Analyzer a, + IndexDeletionPolicy deletionPolicy, IndexWriter.MaxFieldLength mfl) + throws CorruptIndexException, LockObtainFailedException, IOException { + super(d, a, deletionPolicy, mfl); + } + + public TagIndexWriter(Directory d, Analyzer a, IndexWriter.MaxFieldLength mfl) + throws CorruptIndexException, LockObtainFailedException, IOException { + super(d, a, mfl); + } + + private String[] tagFieldNames; + + private String[] docTextFieldNames; + + public void setTagFieldNames(String[] tagFieldNames) { + this.tagFieldNames = tagFieldNames; + 
} + + public void setDocTextFieldNames(String[] docTextFieldNames) { + this.docTextFieldNames = docTextFieldNames; + } + + /** + * Use for frequencies and to avoid renewing Integers. + */ + private class Int { + int x; + + Int() { + x = 1; + } + } + + private class DocTermVectorMapper extends TermVectorMapper { + + private HashMap termDocCount; + private String currField; + + DocTermVectorMapper(HashMap termDocCount) { + this.termDocCount = termDocCount; + } + + public boolean isIgnoringOffsets() { + return true; + } + + public boolean isIgnoringPositions() { + return true; + } + + public void setExpectations(String field, int numTerms, + boolean storeOffsets, boolean storePositions) { + currField = field; + } + + @Override + public void map(BytesRef term, int frequency, + TermVectorOffsetInfo[] offsets, int[] positions) { + Term t = new Term(currField, term); + if (!termDocCount.containsKey(t)) termDocCount.put(t, new Int()); + else { + ((Int) termDocCount.get(t)).x++; + } + } + + } + + /** + * Use for frequencies and to avoid renewing Integers. 
+ */ + protected class Pair implements Comparable { + Term docTerm; + Double score; + + Pair() { + docTerm = null; + score = Double.valueOf(0.0); + } + + Pair(Term docTerm, Double score) { + this.docTerm = docTerm; + this.score = score; + } + + public boolean equals(Object obj) { + if (this == obj) return true; + if ((obj == null) || (obj.getClass() != this.getClass())) return false; + // object must be Pair at this point + Pair test = (Pair) obj; + return docTerm.equals(test.docTerm) /* && score.equals(test.score) */; + } + + public int hashCode() { + int hash = 7; + hash = 31 * hash + docTerm.hashCode(); + hash = 31 * hash + score.hashCode(); + return hash; + } + + public int compareTo(Object arg0) { + if (arg0 instanceof Pair) return score.compareTo(((Pair) arg0).score); + return 0; + } + } + + /** + * determines if the passed term is likely to be of interest in "more like" + * comparisons + * + * @param term + * The word being considered + * @return true if should be ignored, false if should be used in further + * analysis + */ + private boolean isNoiseWord(String term) { + int len = term.length(); + if (minWordLen > 0 && len < minWordLen) { + return true; + } + if (maxWordLen > 0 && len > maxWordLen) { + return true; + } + if (stopWords != null && stopWords.contains(term)) { + return true; + } + return false; + } + + /** + * Stores the top docterms for a given tagterm so they can be used while + * generating queries using MoreLikeThisUsingTags. The top docterms are ranked + * by information gain score. 
+ * + */ + public void writeTagInfo() throws IOException { + + IndexReader reader = IndexReader.open(getDirectory()); + + // remove previously calculated info gain + deleteDocuments(new Term("__tagterm__docterms__", "")); + + int numDocs = maxDoc(); + + // iterate over tag fields + for (int ti = 0; ti < tagFieldNames.length; ++ti) { + String tagFieldName = tagFieldNames[ti]; + Terms terms = MultiFields.getTerms(reader, tagFieldName); + TermsEnum termEnum = terms.iterator(); + BytesRef tagTermBytes = null; + // iterate over tags + while ((tagTermBytes = termEnum.next()) != null) { + // contains the information gain score for docterms associated with the + // tagterm + ArrayList list = new ArrayList(); + Term tagTerm = new Term(tagFieldName, tagTermBytes); + // if (tagTerm.field() != tagFieldName) break; + int numDocsWithTagTerm = reader.docFreq(tagTerm); + DocsEnum docs = MultiFields.getTermDocsEnum(reader, MultiFields + .getDeletedDocs(reader), tagFieldName, tagTermBytes); + // used to count the number of documents that a docterm appears in + HashMap termDocCount = new HashMap(); + int currDoc = 0; + while ((currDoc = docs.nextDoc()) != docs.NO_MORE_DOCS) { + // used to populate a map of a docterms and counts + // (the number of documents containing both the docterm and tagterm) + DocTermVectorMapper vMapper = new DocTermVectorMapper(termDocCount); + for (int j = 0; j < docTextFieldNames.length; ++j) { + String fieldName = docTextFieldNames[j]; + reader.getTermFreqVector(currDoc, fieldName, vMapper); + } + } + + Set entrySet = termDocCount.entrySet(); + Iterator iter = entrySet.iterator(); + while (iter.hasNext()) { + Map.Entry entry = (Map.Entry) iter.next(); + Term docTerm = (Term) entry.getKey(); + if (isNoiseWord(docTerm.text())) continue; + + int numDocsWithTagTermAndDocTerm = ((Int) entry.getValue()).x; + int numDocsWithDocTerm = reader.docFreq(docTerm); + + double score = InformationGain.getScore(numDocsWithTagTermAndDocTerm, + numDocsWithDocTerm, 
numDocsWithTagTerm, numDocs); + + if (!Double.isNaN(score) && !Double.isInfinite(score)) { + list.add(new Pair(docTerm, Double.valueOf(score))); + } + + } + // order the list by descending information gain scores + Collections.sort(list); + Collections.reverse(list); + + int size = list.size() < maxDocTermsPerTagTerm ? list.size() + : maxDocTermsPerTagTerm; + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < size; ++i) { + Pair p = (Pair) list.get(i); + sb.append(p.docTerm.toString()); + sb.append(":").append(p.score).append(" "); + } + Document doc = new Document(); + + Field marker = new Field("__tagterm__docterms__", "", Field.Store.YES, + Field.Index.NO, Field.TermVector.NO); + Field docterms = new Field("__docterms__", sb.toString().getBytes( + "UTF-8"), Field.Store.YES); + Field tagterm = new Field("__tagterm__", tagTerm.toString(), + Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO); + doc.add(marker); + doc.add(docterms); + doc.add(tagterm); + addDocument(doc); + } + } + } + + public void close() throws CorruptIndexException, IOException { + super.commit(); + writeTagInfo(); + super.close(); + } + + /** + * @param args + */ + public static void main(String[] args) { + // TODO Auto-generated method stub + + } + +} Index: lucene/contrib/queries/src/java/org/apache/lucene/search/similar/InformationGain.java =================================================================== --- lucene/contrib/queries/src/java/org/apache/lucene/search/similar/InformationGain.java (revision 0) +++ lucene/contrib/queries/src/java/org/apache/lucene/search/similar/InformationGain.java (revision 0) @@ -0,0 +1,70 @@ +package org.apache.lucene.search.similar; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * The class is used to calculate the information gain + * + * The information gain score is computed using information gain as described in + * "Machine Learning in Automated Text Categorization" by Fabrizio Sebastiani + * */ + +public class InformationGain { + + /** + * Computes the information gain score + * + * @param numDocsWithTagTermAndDocTerm + * number of documents containing docterm and tagged with tagterm + * @param numDocsWithDocTerm + * number of documents containing docterm + * @param numDocsWithTagTerm + * number of documents tagged with tagterm + * @param numDocs + * total number of documents + * @return information gain score + */ + static public double getScore(int numDocsWithTagTermAndDocTerm, + int numDocsWithDocTerm, int numDocsWithTagTerm, int numDocs) { + // probability that a document contains the docTerm + double P_term = (double) numDocsWithDocTerm / (double) numDocs; + // probability that a document DOES NOT contain the docTerm + double P_termp = 1 - P_term; + // probability that a document contains BOTH the docTerm and tagTerm + double P_tag_term = (double) numDocsWithTagTermAndDocTerm + / (double) numDocs; + // probability that a document contains the tagTerm + double P_tag = (double) numDocsWithTagTerm / (double) numDocs; + // number of documents tagged with tagTerm but NOT containing docTerm + int numDocsWithTagTermAndWithoutDocTerm = 
numDocsWithTagTerm + - numDocsWithTagTermAndDocTerm; + // probability that a document contains the tagTerm but DOES NOT contain the + // docTerm + double P_tag_termp = (double) numDocsWithTagTermAndWithoutDocTerm + / (double) numDocs; + + // cannot calculate information gain for the given tag + if (P_tag_term == 0.0 || P_tag_termp == 0.0) return Double.NaN; + + double score = 0.0; + // Information gain score + score = P_tag_term * Math.log(P_tag_term / (P_tag * P_term)) + P_tag_termp + * Math.log(P_tag_termp / (P_tag * P_termp)); + return score; + } +} Index: lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThisUsingTags.java =================================================================== --- lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThisUsingTags.java (revision 0) +++ lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThisUsingTags.java (revision 0) @@ -0,0 +1,668 @@ +package org.apache.lucene.search.similar; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.File; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.document.Document; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.TermFreqVector; +import org.apache.lucene.index.TermVectorMapper; +import org.apache.lucene.index.TermVectorOffsetInfo; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.PriorityQueue; + +import org.apache.lucene.search.similar.MoreLikeThis; +import org.apache.lucene.store.FSDirectory; + +/** + * The class is used to generate similarity queries using the tags associated + * with documents. + * + * The document terms with the highest information gain score for a given tag + * are calculated based on the document collection. In order to generate a query + * for a document, the top document terms are calculated for the tags associated + * with the document and a query is generated. + * + * The generated "more like this" query contains the top document terms + * associated with the tags for the document collection. 
+ * + * */ + +public class MoreLikeThisUsingTags { + + /** + * Field names + */ + protected String[] fieldNames = null; + + /** + * Ignore words if less than this length + */ + private int minWordLen = DEFAULT_MIN_WORD_LENGTH; + + /** + * Ignore words if greater than this length + */ + private int maxWordLen = DEFAULT_MAX_WORD_LENGTH; + + /** + * Don't return a query longer than this. + */ + private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS; + + /** + * Return a Query with no more than this many terms. + * + * @see BooleanQuery#getMaxClauseCount + * @see #getMaxQueryTerms + * @see #setMaxQueryTerms + */ + public static final int DEFAULT_MAX_QUERY_TERMS = 25; + + /** + * Ignore words less than this length or if 0 then this has no effect. + * + * @see #getMinWordLen + * @see #setMinWordLen + */ + public static final int DEFAULT_MIN_WORD_LENGTH = 0; + + /** + * Ignore words greater than this length or if 0 then this has no effect. + * + * @see #getMaxWordLen + * @see #setMaxWordLen + */ + public static final int DEFAULT_MAX_WORD_LENGTH = 0; + + /** + * Default set of stopwords. If null means to allow stop words. + * + * @see #setStopWords + * @see #getStopWords + */ + public static final Set DEFAULT_STOP_WORDS = null; + + /** + * Current set of stop words. + */ + private Set stopWords = DEFAULT_STOP_WORDS; + + /** + * Field names that contain document tags + */ + protected String[] tagFieldNames; + + /** + * Tags that were assigned to the input document, url, file or stream. + */ + protected String[] tags; + + /** + * Default number of document terms that map to a tag term + */ + public static final int DEFAULT_MAX_DOC_TERMS_PER_TAG_TERM = 10; + + /** + * Boost terms in query based on score. 
+ * + * @see #isBoost + * @see #setBoost + */ + public static final boolean DEFAULT_BOOST = false; + + /** + * Default percentage of documents that a document term must appear in to be + * considered to be mapped to a tag term + */ + public static final double DEFAULT_MIN_DOCS_PERCENTAGE = 0.01; + + /** + * Returns the field names that will be used when generating the 'More Like + * This' query. The default field names that will be used is + * {@link #DEFAULT_FIELD_NAMES}. + * + * @return the field names that will be used when generating the 'More Like + * This' query. + */ + public String[] getFieldNames() { + return fieldNames; + } + + /** + * Sets the field names that will be used when generating the 'More Like This' + * query. Set this to null for the field names to be determined at runtime + * from the IndexReader provided in the constructor. + * + * @param fieldNames + * the field names that will be used when generating the 'More Like + * This' query. + */ + public void setFieldNames(String[] fieldNames) { + this.fieldNames = fieldNames; + } + + /** + * Returns the minimum word length below which words will be ignored. Set this + * to 0 for no minimum word length. The default is + * {@link #DEFAULT_MIN_WORD_LENGTH}. + * + * @return the minimum word length below which words will be ignored. + */ + public int getMinWordLen() { + return minWordLen; + } + + /** + * Sets the maximum doc terms mapped to a tag + * + * @param fieldNames + * the field names that will be used when generating the 'More Like + * This' query. + */ + public void setMaxDocTermsPerTagTerm(int maxDocTermsPerTagTerm) { + this.maxDocTermsPerTagTerm = maxDocTermsPerTagTerm; + } + + /** + * Returns the maximum doc terms mapped to a tag + * + * @return the minimum word length below which words will be ignored. + */ + public int getMaxDocTermsPerTagTerm() { + return maxDocTermsPerTagTerm; + } + + /** + * Sets the minimum word length below which words will be ignored. 
+ * + * @param minWordLen + * the minimum word length below which words will be ignored. + */ + public void setMinWordLen(int minWordLen) { + this.minWordLen = minWordLen; + } + + /** + * Returns the minimum percentage of documents a word must appear in. The + * default is {@link #DEFAULT_MIN_DOCS_PERCENTAGE}. + * + * @return the maximum word length above which words will be ignored. + */ + public double getMinDocsPercentage() { + return minDocsPercentage; + } + + /** + * Sets the minimum percentage of documents in collection that a word must + * appear in + * + * @param maxWordLen + * the maximum word length above which words will be ignored. + */ + public void setMinDocsPercentage(double minDocsPercentage) { + this.minDocsPercentage = minDocsPercentage; + } + + /** + * Returns the maximum word length above which words will be ignored. Set this + * to 0 for no maximum word length. The default is + * {@link #DEFAULT_MAX_WORD_LENGTH}. + * + * @return the maximum word length above which words will be ignored. + */ + public int getMaxWordLen() { + return maxWordLen; + } + + /** + * Sets the maximum word length above which words will be ignored. + * + * @param maxWordLen + * the maximum word length above which words will be ignored. + */ + public void setMaxWordLen(int maxWordLen) { + this.maxWordLen = maxWordLen; + } + + /** + * Set the set of stopwords. Any word in this set is considered + * "uninteresting" and ignored. Even if your Analyzer allows stopwords, you + * might want to tell the MoreLikeThis code to ignore them, as for the + * purposes of document similarity it seems reasonable to assume that + * "a stop word is never interesting". 
+ * + * @param stopWords + * set of stopwords, if null it means to allow stop words + * + * @see org.apache.lucene.analysis.StopFilter#makeStopSet + * StopFilter.makeStopSet() + * @see #getStopWords + */ + public void setStopWords(Set stopWords) { + this.stopWords = stopWords; + } + + /** + * Get the current stop words being used. + * + * @see #setStopWords + */ + public Set getStopWords() { + return stopWords; + } + + /** + * Returns the maximum number of query terms that will be included in any + * generated query. The default is {@link #DEFAULT_MAX_QUERY_TERMS}. + * + * @return the maximum number of query terms that will be included in any + * generated query. + */ + public int getMaxQueryTerms() { + return maxQueryTerms; + } + + /** + * Sets the maximum number of query terms that will be included in any + * generated query. + * + * @param maxQueryTerms + * the maximum number of query terms that will be included in any + * generated query. + */ + public void setMaxQueryTerms(int maxQueryTerms) { + this.maxQueryTerms = maxQueryTerms; + } + + /** + * Map from tag term to list of document terms ordered by information gain + * score + */ + protected HashMap tagTermToTopDocTerms = new HashMap(); + + /** + * IndexSearcher used to query index + */ + protected final IndexSearcher searcher; + + /** + * IndexReader to use + */ + protected final IndexReader ir; + + /** + * Maximum number of doc terms that map to a tag term + */ + protected int maxDocTermsPerTagTerm = DEFAULT_MAX_DOC_TERMS_PER_TAG_TERM; + + /** + * Percentage of documents that a document term must appear in to be + * considered to be mapped to a tag term + */ + protected double minDocsPercentage = DEFAULT_MIN_DOCS_PERCENTAGE; + + /** + * Should we apply a boost to the Query based on the scores? 
+ */ + private boolean boost = DEFAULT_BOOST; + + /** + * Constructor + * + * @param ir + * IndexReader used to access index + * @param tagFieldNames + * document field names that contain tags + */ + public MoreLikeThisUsingTags(IndexReader ir, String[] tagFieldNames) { + this.ir = ir; + this.searcher = new IndexSearcher(ir); + this.tagFieldNames = tagFieldNames; + initialize(); + } + + /** + * Sets whether to boost terms in query based on "score" or not. + * + * @param boost + * true to boost terms in query based on "score", false otherwise. + * @see #isBoost + */ + public void setBoost(boolean boost) { + this.boost = boost; + } + + public void setTags(String[] tags) { + this.tags = tags; + } + + public void initialize() { + if (fieldNames == null) { + // gather list of valid fields from lucene + Collection fields = ir.getFieldNames(IndexReader.FieldOption.INDEXED); + fieldNames = (String[]) fields.toArray(new String[fields.size()]); + } + } + + /** + * Fetches the top doc terms from the index + * + * @param tagTerm + * the tagterm to find docterms for + * @return the docterm and score pair + */ + private Pair[] getTopDocTerms(Term tagTerm) throws Exception { + Term t = new Term("__tagterm__", tagTerm.toString()); + DocsEnum docs = MultiFields.getTermDocsEnum(ir, MultiFields + .getDeletedDocs(ir), t.field(), t.bytes()); + + int count = 0; + ArrayList pairList = new ArrayList(); + int docNum = 0; + while ((docNum = docs.nextDoc()) != docs.NO_MORE_DOCS + && count < maxDocTermsPerTagTerm) { + Document doc = ir.document(docNum); + String docterms[] = new String(doc.getBinaryValue("__docterms__"), + "UTF-8").split(" "); + for (int i = 0; i < docterms.length; ++i) { + String s = docterms[i]; + String[] toks = s.split(":"); + Term term = new Term(toks[0], toks[1]); + Double score = Double.valueOf(toks[2]); + pairList.add(new Pair(term, score)); + } + } + + int size = pairList.size() < maxDocTermsPerTagTerm ? 
pairList.size() + : maxDocTermsPerTagTerm; + Pair[] res = new Pair[size]; + res = (Pair[]) pairList.subList(0, size).toArray(res); + return res; + } + + /** + * Find words for a more-like-this query former. + * + * @param docNum + * the id of the lucene document from which to find terms + */ + public PriorityQueue retrieveTerms(int docNum) throws Exception { + int size = 0; + for (int i = 0; i < tagFieldNames.length; ++i) { + String field = tagFieldNames[i]; + TermFreqVector freqVector = ir.getTermFreqVector(docNum, field); + if (freqVector != null) size += freqVector.size(); + } + + FreqQ pq = new FreqQ(maxDocTermsPerTagTerm * size); + if (size == 0) return pq; + ArrayList topTerms = new ArrayList(); + for (int i = 0; i < tagFieldNames.length; ++i) { + String field = tagFieldNames[i]; + TermFreqVector termFreqVec = ir.getTermFreqVector(docNum, field); + for (int j = 0; j < termFreqVec.getTerms().length; ++j) { + BytesRef term = termFreqVec.getTerms()[j]; + Term tagTerm = new Term(field, term); + Pair[] docTermsList = getTopDocTerms(tagTerm); + for (int k = 0; k < docTermsList.length; ++k) { + Pair p = docTermsList[k]; + if (topTerms.contains(p)) { + int n = topTerms.indexOf(p); + Double d = ((Pair) topTerms.get(n)).score; + double val = d.doubleValue() + p.score.doubleValue(); + ((Pair) topTerms.get(n)).score = Double.valueOf(val); + } else { + topTerms.add(p); + } + } + } + } + + for (int i = 0; i < topTerms.size(); ++i) { + Pair p = (Pair) topTerms.get(i); + float val = (float) p.score.doubleValue(); + pq.insertWithOverflow(new Object[] {p.docTerm.text(), // the word + p.docTerm.field(), // the top field + new Float(val)}); + } + + return pq; + } + + /** + * Create the More like query from a PriorityQueue + */ + private Query createQuery(PriorityQueue q) { + BooleanQuery query = new BooleanQuery(); + Object cur; + int qterms = 0; + float bestScore = 0; + + while (((cur = q.pop()) != null)) { + Object[] ar = (Object[]) cur; + TermQuery tq = new TermQuery(new Term((String) ar[1], (String) 
ar[0])); + + if (boost) { + if (qterms == 0) { + bestScore = ((Float) ar[2]).floatValue(); + } + float myScore = ((Float) ar[2]).floatValue(); + + tq.setBoost(myScore / bestScore); + } + + try { + query.add(tq, BooleanClause.Occur.SHOULD); + } catch (BooleanQuery.TooManyClauses ignore) { + break; + } + + qterms++; + if (maxQueryTerms > 0 && qterms >= maxQueryTerms) { + break; + } + } + + return query; + } + + /** + * Return a query that will return docs related to the tags passed in + * + * @param tags + * the set of tags to find related documents to + * @return a query that will return docs related to the tags passed in + * @throws Exception + */ + public Query like(String[] fields, String[] tags) throws Exception { + return createQuery(retrieveTerms(fields, tags)); + } + + /** + * Find words for a more-like-this query former. + * + * @param fields + * tag field corresponding to tag in tag array + * @param tags + * array of tags + * @throws Exception + */ + public PriorityQueue retrieveTerms(String[] fields, String[] tags) + throws Exception { + ArrayList topTerms = new ArrayList(); + FreqQ pq = new FreqQ(maxDocTermsPerTagTerm * tags.length); + if (fields.length != tags.length) return pq; + + for (int j = 0; j < fields.length; ++j) { + Term tagTerm = new Term(fields[j], tags[j]); + Pair[] docTermsList = getTopDocTerms(tagTerm); + for (int k = 0; k < docTermsList.length; ++k) { + Pair p = docTermsList[k]; + if (topTerms.contains(p)) { + int i = topTerms.indexOf(p); + Double d = ((Pair) topTerms.get(i)).score; + double val = d.doubleValue() + p.score.doubleValue(); + d = Double.valueOf(val); + } else { + topTerms.add(p); + } + } + } + + for (int i = 0; i < topTerms.size(); ++i) { + Pair p = (Pair) topTerms.get(i); + float val = p.score.floatValue(); + pq.insertWithOverflow(new Object[] {p.docTerm.text(), // the word + p.docTerm.field(), // the top field + new Float(val)}); + } + + return pq; + } + + /** + * determines if the passed term is likely to be of interest 
in "more like" + * comparisons + * + * @param term + * The word being considered + * @return true if should be ignored, false if should be used in further + * analysis + */ + private boolean isNoiseWord(String term) { + int len = term.length(); + if (minWordLen > 0 && len < minWordLen) { + return true; + } + if (maxWordLen > 0 && len > maxWordLen) { + return true; + } + if (stopWords != null && stopWords.contains(term)) { + return true; + } + return false; + } + + /** + * Return a query that will return docs like the passed lucene document ID. + * + * @param docNum + * the documentID of the lucene doc to generate the 'More Like This" + * query for. + * @return a query that will return docs like the passed lucene document ID. + */ + public Query like(int docNum) throws Exception { + return createQuery(retrieveTerms(docNum)); + } + + /** + * Use for frequencies and to avoid renewing Integers. + */ + protected class Pair implements Comparable { + Term docTerm; + Double score; + + Pair() { + docTerm = null; + score = Double.valueOf(0.0); + } + + Pair(Term docTerm, Double score) { + this.docTerm = docTerm; + this.score = score; + } + + public boolean equals(Object obj) { + if (this == obj) return true; + if ((obj == null) || (obj.getClass() != this.getClass())) return false; + // object must be Pair at this point + Pair test = (Pair) obj; + return docTerm.equals(test.docTerm) /* && score.equals(test.score) */; + } + + public int hashCode() { + int hash = 7; + hash = 31 * hash + docTerm.hashCode(); + hash = 31 * hash + score.hashCode(); + return hash; + } + + public int compareTo(Object arg0) { + if (arg0 instanceof Pair) return score.compareTo(((Pair) arg0).score); + return 0; + } + + public String toString() { + return docTerm.toString() + " : " + score; + } + } + + /** + * PriorityQueue that orders words by score. 
+ */ + private static class FreqQ extends PriorityQueue { + FreqQ(int s) { + initialize(s); + } + + protected boolean lessThan(Object a, Object b) { + Object[] aa = (Object[]) a; + Object[] bb = (Object[]) b; + Float fa = (Float) aa[2]; + Float fb = (Float) bb[2]; + return fa.floatValue() > fb.floatValue(); + } + } + + /** + * Use for frequencies and to avoid renewing Integers. + */ + private class Int { + int x; + + Int() { + x = 1; + } + } + +}