Index: lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java =================================================================== --- lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java (revision 1401723) +++ lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java (working copy) @@ -15,23 +15,30 @@ */ package org.apache.lucene.queries.mlt; -import java.io.*; -import java.util.*; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.document.Document; import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.StorableField; import org.apache.lucene.index.StoredDocument; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.*; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.search.similarities.TFIDFSimilarity; import org.apache.lucene.util.BytesRef; @@ -135,787 +142,899 @@ * - refactor: moved common code into isNoiseWord() * - optimise: when no termvector support available - used maxNumTermsParsed to limit amount of tokenization * + * + *
+ * Changes: Ying Andrews 23/10/12
+ * Added support for multiple index readers so that More Like This can generate a similarity query based on multiple indexes.<br>
+ * This extends the MoreLikeThis feature to work with the Lucene MultiSearcher.<br>
+ * 
+ * For example: 
+ * Due to its large size, we may want to divide a sales index into: sales_1, sales_2, sales_3, ..., sales_n.
+ * In this case we would best use a parallel multi-searcher to do the search. The old MoreLikeThis.java does not support
+ * this scenario.  If the current document of interest comes from index sales_1, then the query returned from
+ * like(int) and like(Reader, String) will be based only on index sales_1, which does not reflect the
+ * whole document population.
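+ *
+ * When an IndexReader[] is supplied, the modified createQueue(Map) combines the term
+ * statistics of all the readers before computing idf. Roughly (this describes the
+ * behaviour implemented below, not an exact transcript of the code):
+ *
+ *   numDocs = sum of numDocs() over every reader in the array
+ *   docFreq = average of docFreq(term) over the readers that contain the term,
+ *             rounded to the nearest integer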
+ * 
+ * Modified:
+ * constructors   - MoreLikeThis(IndexReader), 
+ * 				    MoreLikeThis(IndexReader, TFIDFSimilarity)
+ * private method - createQueue(Map)
+ * 
+ * Added:
+ * constructors   - MoreLikeThis(IndexReader, IndexReader[]), 
+ * 					MoreLikeThis(IndexReader, IndexReader[], TFIDFSimilarity)
+ *				
+ * Notes: 
+ * When invoking the like(int) method of this class, you must pass in the NORMALIZED document number.
+ * You can use the same algorithm used in the Lucene MultiSearcher class, specifically in its
+ * subSearcher(int) and subDoc(int) methods, as in the sketch below.
+ * 
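+ * A minimal usage sketch (readers, globalDoc and analyzer are illustrative variables,
+ * not part of this API). Given a document number that is global across all the
+ * sub-indexes, normalize it the same way MultiSearcher.subSearcher(int) and
+ * subDoc(int) do, then build the query:
+ * <pre>
+ * IndexReader[] readers = ...;                // one reader per sales_N index
+ * int i = 0, base = 0;
+ * while (globalDoc >= base + readers[i].maxDoc()) {
+ *   base += readers[i].maxDoc();              // skip past the earlier sub-indexes
+ *   i++;
+ * }
+ * int normalizedDoc = globalDoc - base;       // doc id local to readers[i]
+ * MoreLikeThis mlt = new MoreLikeThis(readers[i], readers);
+ * mlt.setAnalyzer(analyzer);                  // required if the example doc has no term vectors
+ * Query query = mlt.like(normalizedDoc);
+ * </pre>
+ * 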
*/ public final class MoreLikeThis { - /** - * Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support. - * - * @see #getMaxNumTokensParsed - */ - public static final int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000; + /** + * Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support. + * + * @see #getMaxNumTokensParsed + */ + public static final int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000; + + /** + * Ignore terms with less than this frequency in the source doc. + * + * @see #getMinTermFreq + * @see #setMinTermFreq + */ + public static final int DEFAULT_MIN_TERM_FREQ = 2; + + /** + * Ignore words which do not occur in at least this many docs. + * + * @see #getMinDocFreq + * @see #setMinDocFreq + */ + public static final int DEFAULT_MIN_DOC_FREQ = 5; + + /** + * Ignore words which occur in more than this many docs. + * + * @see #getMaxDocFreq + * @see #setMaxDocFreq + * @see #setMaxDocFreqPct + */ + public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE; + + /** + * Boost terms in query based on score. + * + * @see #isBoost + * @see #setBoost + */ + public static final boolean DEFAULT_BOOST = false; + + /** + * Default field names. Null is used to specify that the field names should be looked + * up at runtime from the provided reader. + */ + public static final String[] DEFAULT_FIELD_NAMES = new String[]{"contents"}; - /** - * Ignore terms with less than this frequency in the source doc. - * - * @see #getMinTermFreq - * @see #setMinTermFreq - */ - public static final int DEFAULT_MIN_TERM_FREQ = 2; + /** + * Ignore words less than this length or if 0 then this has no effect. + * + * @see #getMinWordLen + * @see #setMinWordLen + */ + public static final int DEFAULT_MIN_WORD_LENGTH = 0; - /** - * Ignore words which do not occur in at least this many docs. - * - * @see #getMinDocFreq - * @see #setMinDocFreq - */ - public static final int DEFAULT_MIN_DOC_FREQ = 5; + /** + * Ignore words greater than this length or if 0 then this has no effect. + * + * @see #getMaxWordLen + * @see #setMaxWordLen + */ + public static final int DEFAULT_MAX_WORD_LENGTH = 0; - /** - * Ignore words which occur in more than this many docs. - * - * @see #getMaxDocFreq - * @see #setMaxDocFreq - * @see #setMaxDocFreqPct - */ - public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE; + /** + * Default set of stopwords. + * If null means to allow stop words. + * + * @see #setStopWords + * @see #getStopWords + */ + public static final Set DEFAULT_STOP_WORDS = null; - /** - * Boost terms in query based on score. - * - * @see #isBoost - * @see #setBoost - */ - public static final boolean DEFAULT_BOOST = false; + /** + * Current set of stop words. + */ + private Set stopWords = DEFAULT_STOP_WORDS; - /** - * Default field names. Null is used to specify that the field names should be looked - * up at runtime from the provided reader. - */ - public static final String[] DEFAULT_FIELD_NAMES = new String[]{"contents"}; + /** + * Return a Query with no more than this many terms. + * + * @see BooleanQuery#getMaxClauseCount + * @see #getMaxQueryTerms + * @see #setMaxQueryTerms + */ + public static final int DEFAULT_MAX_QUERY_TERMS = 25; - /** - * Ignore words less than this length or if 0 then this has no effect. - * - * @see #getMinWordLen - * @see #setMinWordLen - */ - public static final int DEFAULT_MIN_WORD_LENGTH = 0; + /** + * Analyzer that will be used to parse the doc. 
+ */ + private Analyzer analyzer = null; - /** - * Ignore words greater than this length or if 0 then this has no effect. - * - * @see #getMaxWordLen - * @see #setMaxWordLen - */ - public static final int DEFAULT_MAX_WORD_LENGTH = 0; + /** + * Ignore words less frequent that this. + */ + private int minTermFreq = DEFAULT_MIN_TERM_FREQ; - /** - * Default set of stopwords. - * If null means to allow stop words. - * - * @see #setStopWords - * @see #getStopWords - */ - public static final Set DEFAULT_STOP_WORDS = null; + /** + * Ignore words which do not occur in at least this many docs. + */ + private int minDocFreq = DEFAULT_MIN_DOC_FREQ; - /** - * Current set of stop words. - */ - private Set stopWords = DEFAULT_STOP_WORDS; + /** + * Ignore words which occur in more than this many docs. + */ + private int maxDocFreq = DEFAULT_MAX_DOC_FREQ; - /** - * Return a Query with no more than this many terms. - * - * @see BooleanQuery#getMaxClauseCount - * @see #getMaxQueryTerms - * @see #setMaxQueryTerms - */ - public static final int DEFAULT_MAX_QUERY_TERMS = 25; + /** + * Should we apply a boost to the Query based on the scores? + */ + private boolean boost = DEFAULT_BOOST; - /** - * Analyzer that will be used to parse the doc. - */ - private Analyzer analyzer = null; + /** + * Field name we'll analyze. + */ + private String[] fieldNames = DEFAULT_FIELD_NAMES; - /** - * Ignore words less frequent that this. - */ - private int minTermFreq = DEFAULT_MIN_TERM_FREQ; + /** + * The maximum number of tokens to parse in each example doc field that is not stored with TermVector support + */ + private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED; - /** - * Ignore words which do not occur in at least this many docs. - */ - private int minDocFreq = DEFAULT_MIN_DOC_FREQ; + /** + * Ignore words if less than this len. + */ + private int minWordLen = DEFAULT_MIN_WORD_LENGTH; - /** - * Ignore words which occur in more than this many docs. - */ - private int maxDocFreq = DEFAULT_MAX_DOC_FREQ; + /** + * Ignore words if greater than this len. + */ + private int maxWordLen = DEFAULT_MAX_WORD_LENGTH; - /** - * Should we apply a boost to the Query based on the scores? - */ - private boolean boost = DEFAULT_BOOST; + /** + * Don't return a query longer than this. + */ + private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS; - /** - * Field name we'll analyze. - */ - private String[] fieldNames = DEFAULT_FIELD_NAMES; + /** + * For idf() calculations. + */ + private TFIDFSimilarity similarity;// = new DefaultSimilarity(); - /** - * The maximum number of tokens to parse in each example doc field that is not stored with TermVector support - */ - private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED; + /** + * IndexReader to use + */ + private final IndexReader ir; - /** - * Ignore words if less than this len. - */ - private int minWordLen = DEFAULT_MIN_WORD_LENGTH; + /** + * IndexReader array to use when multi-searchers are used. + */ + private final IndexReader [] irArray; - /** - * Ignore words if greater than this len. - */ - private int maxWordLen = DEFAULT_MAX_WORD_LENGTH; + /** + * Boost factor to use when boosting the terms + */ + private float boostFactor = 1; - /** - * Don't return a query longer than this. 
- */ - private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS; + /** + * Returns the boost factor used when boosting terms + * + * @return the boost factor used when boosting terms + * @see #setBoostFactor(float) + */ + public float getBoostFactor() { + return boostFactor; + } - /** - * For idf() calculations. - */ - private TFIDFSimilarity similarity;// = new DefaultSimilarity(); + /** + * Sets the boost factor to use when boosting terms + * + * @see #getBoostFactor() + */ + public void setBoostFactor(float boostFactor) { + this.boostFactor = boostFactor; + } - /** - * IndexReader to use - */ - private final IndexReader ir; + // /** + // * Constructor requiring an IndexReader. + // */ + // public MoreLikeThis(IndexReader ir) { + // this(ir, new DefaultSimilarity()); + // } + // + // public MoreLikeThis(IndexReader ir, TFIDFSimilarity sim) { + // this.ir = ir; + // this.similarity = sim; + // } - /** - * Boost factor to use when boosting the terms - */ - private float boostFactor = 1; + /** + * Constructor requiring an IndexReader. + * @param ir A single indexReader to use the TermFrequiencies and inverse Document Frequencies from + */ + public MoreLikeThis(IndexReader ir) { + this(ir, null, new DefaultSimilarity()); + } - /** - * Returns the boost factor used when boosting terms - * - * @return the boost factor used when boosting terms - * @see #setBoostFactor(float) - */ - public float getBoostFactor() { - return boostFactor; - } + /** + * Constructor requiring an IndexReader and similarity. + * @param ir A single indexReader to use the TermFrequiencies and inverse Document Frequencies from + * @param sim A similarity object that contains scoring components + */ + public MoreLikeThis(IndexReader ir, TFIDFSimilarity sim){ + this(ir,null,sim); + } + /** + * Constructor requiring an array of IndexReaders to support multi-searcher, so we can + * get the correct Inverse Document Frequencies. + * + * @param ir The index reader that current document of interest comes from. + * "More Like This" tries to find similar documents to this one. + * @param irArray An array of IndexReaders that "More Like This" will use to + * look for similar documents to the one that comes from argument "ir". + * This array may contain the index reader "ir" from which + * the current document of interest comes. + * + */ + public MoreLikeThis(IndexReader ir, IndexReader[] irArray) { + this(ir, irArray, new DefaultSimilarity()); + } - /** - * Sets the boost factor to use when boosting terms - * - * @see #getBoostFactor() - */ - public void setBoostFactor(float boostFactor) { - this.boostFactor = boostFactor; - } + /** + * Added an array of IndexReaders, to support multi-searcher, so we can + * get the correct Inverse Document Frequencies. + * + * @param ir An array of IndexReaders to get the total number of documents from. + */ + public MoreLikeThis(IndexReader ir, IndexReader [] irArray, TFIDFSimilarity sim){ + this.irArray = irArray; + this.ir=ir; + this.similarity = sim; + } - /** - * Constructor requiring an IndexReader. - */ - public MoreLikeThis(IndexReader ir) { - this(ir, new DefaultSimilarity()); - } - public MoreLikeThis(IndexReader ir, TFIDFSimilarity sim) { - this.ir = ir; - this.similarity = sim; - } + public TFIDFSimilarity getSimilarity() { + return similarity; + } + public void setSimilarity(TFIDFSimilarity similarity) { + this.similarity = similarity; + } - public TFIDFSimilarity getSimilarity() { - return similarity; - } + /** + * Returns an analyzer that will be used to parse source doc with. 
The default analyzer + * is not set. + * + * @return the analyzer that will be used to parse source doc with. + */ + public Analyzer getAnalyzer() { + return analyzer; + } - public void setSimilarity(TFIDFSimilarity similarity) { - this.similarity = similarity; - } + /** + * Sets the analyzer to use. An analyzer is not required for generating a query with the + * {@link #like(int)} method, all other 'like' methods require an analyzer. + * + * @param analyzer the analyzer to use to tokenize text. + */ + public void setAnalyzer(Analyzer analyzer) { + this.analyzer = analyzer; + } - /** - * Returns an analyzer that will be used to parse source doc with. The default analyzer - * is not set. - * - * @return the analyzer that will be used to parse source doc with. - */ - public Analyzer getAnalyzer() { - return analyzer; - } + /** + * Returns the frequency below which terms will be ignored in the source doc. The default + * frequency is the {@link #DEFAULT_MIN_TERM_FREQ}. + * + * @return the frequency below which terms will be ignored in the source doc. + */ + public int getMinTermFreq() { + return minTermFreq; + } - /** - * Sets the analyzer to use. An analyzer is not required for generating a query with the - * {@link #like(int)} method, all other 'like' methods require an analyzer. - * - * @param analyzer the analyzer to use to tokenize text. - */ - public void setAnalyzer(Analyzer analyzer) { - this.analyzer = analyzer; - } + /** + * Sets the frequency below which terms will be ignored in the source doc. + * + * @param minTermFreq the frequency below which terms will be ignored in the source doc. + */ + public void setMinTermFreq(int minTermFreq) { + this.minTermFreq = minTermFreq; + } - /** - * Returns the frequency below which terms will be ignored in the source doc. The default - * frequency is the {@link #DEFAULT_MIN_TERM_FREQ}. - * - * @return the frequency below which terms will be ignored in the source doc. - */ - public int getMinTermFreq() { - return minTermFreq; - } + /** + * Returns the frequency at which words will be ignored which do not occur in at least this + * many docs. The default frequency is {@link #DEFAULT_MIN_DOC_FREQ}. + * + * @return the frequency at which words will be ignored which do not occur in at least this + * many docs. + */ + public int getMinDocFreq() { + return minDocFreq; + } - /** - * Sets the frequency below which terms will be ignored in the source doc. - * - * @param minTermFreq the frequency below which terms will be ignored in the source doc. - */ - public void setMinTermFreq(int minTermFreq) { - this.minTermFreq = minTermFreq; - } + /** + * Sets the frequency at which words will be ignored which do not occur in at least this + * many docs. + * + * @param minDocFreq the frequency at which words will be ignored which do not occur in at + * least this many docs. + */ + public void setMinDocFreq(int minDocFreq) { + this.minDocFreq = minDocFreq; + } - /** - * Returns the frequency at which words will be ignored which do not occur in at least this - * many docs. The default frequency is {@link #DEFAULT_MIN_DOC_FREQ}. - * - * @return the frequency at which words will be ignored which do not occur in at least this - * many docs. - */ - public int getMinDocFreq() { - return minDocFreq; - } + /** + * Returns the maximum frequency in which words may still appear. + * Words that appear in more than this many docs will be ignored. The default frequency is + * {@link #DEFAULT_MAX_DOC_FREQ}. 
+ * + * @return get the maximum frequency at which words are still allowed, + * words which occur in more docs than this are ignored. + */ + public int getMaxDocFreq() { + return maxDocFreq; + } - /** - * Sets the frequency at which words will be ignored which do not occur in at least this - * many docs. - * - * @param minDocFreq the frequency at which words will be ignored which do not occur in at - * least this many docs. - */ - public void setMinDocFreq(int minDocFreq) { - this.minDocFreq = minDocFreq; - } + /** + * Set the maximum frequency in which words may still appear. Words that appear + * in more than this many docs will be ignored. + * + * @param maxFreq the maximum count of documents that a term may appear + * in to be still considered relevant + */ + public void setMaxDocFreq(int maxFreq) { + this.maxDocFreq = maxFreq; + } - /** - * Returns the maximum frequency in which words may still appear. - * Words that appear in more than this many docs will be ignored. The default frequency is - * {@link #DEFAULT_MAX_DOC_FREQ}. - * - * @return get the maximum frequency at which words are still allowed, - * words which occur in more docs than this are ignored. - */ - public int getMaxDocFreq() { - return maxDocFreq; - } + /** + * Set the maximum percentage in which words may still appear. Words that appear + * in more than this many percent of all docs will be ignored. + * + * @param maxPercentage the maximum percentage of documents (0-100) that a term may appear + * in to be still considered relevant + */ + public void setMaxDocFreqPct(int maxPercentage) { + this.maxDocFreq = maxPercentage * ir.numDocs() / 100; + } - /** - * Set the maximum frequency in which words may still appear. Words that appear - * in more than this many docs will be ignored. - * - * @param maxFreq the maximum count of documents that a term may appear - * in to be still considered relevant - */ - public void setMaxDocFreq(int maxFreq) { - this.maxDocFreq = maxFreq; - } - /** - * Set the maximum percentage in which words may still appear. Words that appear - * in more than this many percent of all docs will be ignored. - * - * @param maxPercentage the maximum percentage of documents (0-100) that a term may appear - * in to be still considered relevant - */ - public void setMaxDocFreqPct(int maxPercentage) { - this.maxDocFreq = maxPercentage * ir.numDocs() / 100; - } + /** + * Returns whether to boost terms in query based on "score" or not. The default is + * {@link #DEFAULT_BOOST}. + * + * @return whether to boost terms in query based on "score" or not. + * @see #setBoost + */ + public boolean isBoost() { + return boost; + } + /** + * Sets whether to boost terms in query based on "score" or not. + * + * @param boost true to boost terms in query based on "score", false otherwise. + * @see #isBoost + */ + public void setBoost(boolean boost) { + this.boost = boost; + } - /** - * Returns whether to boost terms in query based on "score" or not. The default is - * {@link #DEFAULT_BOOST}. - * - * @return whether to boost terms in query based on "score" or not. - * @see #setBoost - */ - public boolean isBoost() { - return boost; - } + /** + * Returns the field names that will be used when generating the 'More Like This' query. + * The default field names that will be used is {@link #DEFAULT_FIELD_NAMES}. + * + * @return the field names that will be used when generating the 'More Like This' query. 
+ */ + public String[] getFieldNames() { + return fieldNames; + } - /** - * Sets whether to boost terms in query based on "score" or not. - * - * @param boost true to boost terms in query based on "score", false otherwise. - * @see #isBoost - */ - public void setBoost(boolean boost) { - this.boost = boost; - } + /** + * Sets the field names that will be used when generating the 'More Like This' query. + * Set this to null for the field names to be determined at runtime from the IndexReader + * provided in the constructor. + * + * @param fieldNames the field names that will be used when generating the 'More Like This' + * query. + */ + public void setFieldNames(String[] fieldNames) { + this.fieldNames = fieldNames; + } - /** - * Returns the field names that will be used when generating the 'More Like This' query. - * The default field names that will be used is {@link #DEFAULT_FIELD_NAMES}. - * - * @return the field names that will be used when generating the 'More Like This' query. - */ - public String[] getFieldNames() { - return fieldNames; - } + /** + * Returns the minimum word length below which words will be ignored. Set this to 0 for no + * minimum word length. The default is {@link #DEFAULT_MIN_WORD_LENGTH}. + * + * @return the minimum word length below which words will be ignored. + */ + public int getMinWordLen() { + return minWordLen; + } - /** - * Sets the field names that will be used when generating the 'More Like This' query. - * Set this to null for the field names to be determined at runtime from the IndexReader - * provided in the constructor. - * - * @param fieldNames the field names that will be used when generating the 'More Like This' - * query. - */ - public void setFieldNames(String[] fieldNames) { - this.fieldNames = fieldNames; - } + /** + * Sets the minimum word length below which words will be ignored. + * + * @param minWordLen the minimum word length below which words will be ignored. + */ + public void setMinWordLen(int minWordLen) { + this.minWordLen = minWordLen; + } - /** - * Returns the minimum word length below which words will be ignored. Set this to 0 for no - * minimum word length. The default is {@link #DEFAULT_MIN_WORD_LENGTH}. - * - * @return the minimum word length below which words will be ignored. - */ - public int getMinWordLen() { - return minWordLen; - } + /** + * Returns the maximum word length above which words will be ignored. Set this to 0 for no + * maximum word length. The default is {@link #DEFAULT_MAX_WORD_LENGTH}. + * + * @return the maximum word length above which words will be ignored. + */ + public int getMaxWordLen() { + return maxWordLen; + } - /** - * Sets the minimum word length below which words will be ignored. - * - * @param minWordLen the minimum word length below which words will be ignored. - */ - public void setMinWordLen(int minWordLen) { - this.minWordLen = minWordLen; - } + /** + * Sets the maximum word length above which words will be ignored. + * + * @param maxWordLen the maximum word length above which words will be ignored. + */ + public void setMaxWordLen(int maxWordLen) { + this.maxWordLen = maxWordLen; + } - /** - * Returns the maximum word length above which words will be ignored. Set this to 0 for no - * maximum word length. The default is {@link #DEFAULT_MAX_WORD_LENGTH}. - * - * @return the maximum word length above which words will be ignored. - */ - public int getMaxWordLen() { - return maxWordLen; - } + /** + * Set the set of stopwords. + * Any word in this set is considered "uninteresting" and ignored. 
+ * Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as + * for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting". + * + * @param stopWords set of stopwords, if null it means to allow stop words + * @see #getStopWords + */ + public void setStopWords(Set stopWords) { + this.stopWords = stopWords; + } - /** - * Sets the maximum word length above which words will be ignored. - * - * @param maxWordLen the maximum word length above which words will be ignored. - */ - public void setMaxWordLen(int maxWordLen) { - this.maxWordLen = maxWordLen; - } + /** + * Get the current stop words being used. + * + * @see #setStopWords + */ + public Set getStopWords() { + return stopWords; + } - /** - * Set the set of stopwords. - * Any word in this set is considered "uninteresting" and ignored. - * Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as - * for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting". - * - * @param stopWords set of stopwords, if null it means to allow stop words - * @see #getStopWords - */ - public void setStopWords(Set stopWords) { - this.stopWords = stopWords; - } - /** - * Get the current stop words being used. - * - * @see #setStopWords - */ - public Set getStopWords() { - return stopWords; - } + /** + * Returns the maximum number of query terms that will be included in any generated query. + * The default is {@link #DEFAULT_MAX_QUERY_TERMS}. + * + * @return the maximum number of query terms that will be included in any generated query. + */ + public int getMaxQueryTerms() { + return maxQueryTerms; + } + /** + * Sets the maximum number of query terms that will be included in any generated query. + * + * @param maxQueryTerms the maximum number of query terms that will be included in any + * generated query. + */ + public void setMaxQueryTerms(int maxQueryTerms) { + this.maxQueryTerms = maxQueryTerms; + } - /** - * Returns the maximum number of query terms that will be included in any generated query. - * The default is {@link #DEFAULT_MAX_QUERY_TERMS}. - * - * @return the maximum number of query terms that will be included in any generated query. - */ - public int getMaxQueryTerms() { - return maxQueryTerms; - } + /** + * @return The maximum number of tokens to parse in each example doc field that is not stored with TermVector support + * @see #DEFAULT_MAX_NUM_TOKENS_PARSED + */ + public int getMaxNumTokensParsed() { + return maxNumTokensParsed; + } - /** - * Sets the maximum number of query terms that will be included in any generated query. - * - * @param maxQueryTerms the maximum number of query terms that will be included in any - * generated query. 
- */ - public void setMaxQueryTerms(int maxQueryTerms) { - this.maxQueryTerms = maxQueryTerms; - } + /** + * @param i The maximum number of tokens to parse in each example doc field that is not stored with TermVector support + */ + public void setMaxNumTokensParsed(int i) { + maxNumTokensParsed = i; + } - /** - * @return The maximum number of tokens to parse in each example doc field that is not stored with TermVector support - * @see #DEFAULT_MAX_NUM_TOKENS_PARSED - */ - public int getMaxNumTokensParsed() { - return maxNumTokensParsed; - } - /** - * @param i The maximum number of tokens to parse in each example doc field that is not stored with TermVector support - */ - public void setMaxNumTokensParsed(int i) { - maxNumTokensParsed = i; - } + /** + * Return a query that will return docs like the passed lucene document ID. + * + * @param docNum the documentID of the lucene doc to generate the 'More Like This" query for. + * @return a query that will return docs like the passed lucene document ID. + */ + public Query like(int docNum) throws IOException { + if (fieldNames == null) { + // gather list of valid fields from lucene + Collection fields = MultiFields.getIndexedFields(ir); + fieldNames = fields.toArray(new String[fields.size()]); + } + return createQuery(retrieveTerms(docNum)); + } - /** - * Return a query that will return docs like the passed lucene document ID. - * - * @param docNum the documentID of the lucene doc to generate the 'More Like This" query for. - * @return a query that will return docs like the passed lucene document ID. - */ - public Query like(int docNum) throws IOException { - if (fieldNames == null) { - // gather list of valid fields from lucene - Collection fields = MultiFields.getIndexedFields(ir); - fieldNames = fields.toArray(new String[fields.size()]); - } + /** + * Return a query that will return docs like the passed Reader. + * + * @return a query that will return docs like the passed Reader. + */ + public Query like(Reader r, String fieldName) throws IOException { + return createQuery(retrieveTerms(r, fieldName)); + } - return createQuery(retrieveTerms(docNum)); - } + /** + * Create the More like query from a PriorityQueue + */ + private Query createQuery(PriorityQueue q) { + BooleanQuery query = new BooleanQuery(); + Object cur; + int qterms = 0; + float bestScore = 0; - /** - * Return a query that will return docs like the passed Reader. - * - * @return a query that will return docs like the passed Reader. 
- */ - public Query like(Reader r, String fieldName) throws IOException { - return createQuery(retrieveTerms(r, fieldName)); - } + while ((cur = q.pop()) != null) { + Object[] ar = (Object[]) cur; + TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0])); - /** - * Create the More like query from a PriorityQueue - */ - private Query createQuery(PriorityQueue q) { - BooleanQuery query = new BooleanQuery(); - Object cur; - int qterms = 0; - float bestScore = 0; + if (boost) { + if (qterms == 0) { + bestScore = ((Float) ar[2]); + } + float myScore = ((Float) ar[2]); - while ((cur = q.pop()) != null) { - Object[] ar = (Object[]) cur; - TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0])); + tq.setBoost(boostFactor * myScore / bestScore); + } - if (boost) { - if (qterms == 0) { - bestScore = ((Float) ar[2]); - } - float myScore = ((Float) ar[2]); + try { + query.add(tq, BooleanClause.Occur.SHOULD); + } + catch (BooleanQuery.TooManyClauses ignore) { + break; + } - tq.setBoost(boostFactor * myScore / bestScore); - } + qterms++; + if (maxQueryTerms > 0 && qterms >= maxQueryTerms) { + break; + } + } - try { - query.add(tq, BooleanClause.Occur.SHOULD); - } - catch (BooleanQuery.TooManyClauses ignore) { - break; - } + return query; + } - qterms++; - if (maxQueryTerms > 0 && qterms >= maxQueryTerms) { - break; - } - } + /** + * Create a PriorityQueue from a word->tf map. + * + * @param words a map of words keyed on the word(String) with Int objects as the values. + */ + private PriorityQueue createQueue(Map words) throws IOException { + // have collected all words in doc and their freqs + int numDocs=0; + if (irArray == null) + numDocs = ir.numDocs(); + else { + int irArrayLength=irArray.length; + for (int irIndex=0;irIndextf map. - * - * @param words a map of words keyed on the word(String) with Int objects as the values. - */ - private PriorityQueue createQueue(Map words) throws IOException { - // have collected all words in doc and their freqs - int numDocs = ir.numDocs(); - FreqQ res = new FreqQ(words.size()); // will order words by score + int tf = words.get(word).x; // term freq in the source doc + if (minTermFreq > 0 && tf < minTermFreq) { + continue; // filter out words that don't occur enough times in the source + } - for (String word : words.keySet()) { // for every word - int tf = words.get(word).x; // term freq in the source doc - if (minTermFreq > 0 && tf < minTermFreq) { - continue; // filter out words that don't occur enough times in the source - } + // go through all the fields and find the largest document frequency + String topField = fieldNames[0]; + int docFreq = 0; + for (String fieldName : fieldNames) { + Term fieldTerm = new Term(fieldName, word); + int freq = 0; + double freqDouble = 0.0; + if (irArray == null) { + freq = ir.docFreq(fieldTerm); + } + else { + int irArrayLength = irArray.length; + int irDocFreq = 0; + double irTotal = 0;//Avoid integer division when average freq is calculated. 
+ int irTotalTermsFound = 0; + for (int irIndex = 0; irIndex < irArrayLength; irIndex++) { + if (irArray[irIndex] != null) { + irDocFreq = irArray[irIndex].docFreq(fieldTerm); + if (irDocFreq != 0) { + irTotalTermsFound++; + irTotal += irDocFreq; + } + } + } + if (irTotalTermsFound != 0) { + freqDouble = irTotal/irTotalTermsFound; //Avoid integer division + freq = (int)(freqDouble + 0.5); //Round it to integer - // go through all the fields and find the largest document frequency - String topField = fieldNames[0]; - int docFreq = 0; - for (String fieldName : fieldNames) { - int freq = ir.docFreq(new Term(fieldName, word)); - topField = (freq > docFreq) ? fieldName : topField; - docFreq = (freq > docFreq) ? freq : docFreq; - } + } + } - if (minDocFreq > 0 && docFreq < minDocFreq) { - continue; // filter out words that don't occur in enough docs - } + topField = (freq > docFreq) ? fieldName : topField; + docFreq = (freq > docFreq) ? freq : docFreq; + } - if (docFreq > maxDocFreq) { - continue; // filter out words that occur in too many docs - } + if (minDocFreq > 0 && docFreq < minDocFreq) { + continue; // filter out words that don't occur in enough docs + } - if (docFreq == 0) { - continue; // index update problem? - } + if (docFreq > maxDocFreq) { + continue; // filter out words that occur in too many docs + } - float idf = similarity.idf(docFreq, numDocs); - float score = tf * idf; + if (docFreq == 0) { + continue; // index update problem? + } - // only really need 1st 3 entries, other ones are for troubleshooting - res.insertWithOverflow(new Object[]{word, // the word - topField, // the top field - score, // overall score - idf, // idf - docFreq, // freq in all docs - tf - }); - } - return res; - } + float idf = similarity.idf(docFreq, numDocs); + float score = tf * idf; - /** - * Describe the parameters that control how the "more like this" query is formed. - */ - public String describeParams() { - StringBuilder sb = new StringBuilder(); - sb.append("\t").append("maxQueryTerms : ").append(maxQueryTerms).append("\n"); - sb.append("\t").append("minWordLen : ").append(minWordLen).append("\n"); - sb.append("\t").append("maxWordLen : ").append(maxWordLen).append("\n"); - sb.append("\t").append("fieldNames : "); - String delim = ""; - for (String fieldName : fieldNames) { - sb.append(delim).append(fieldName); - delim = ", "; - } - sb.append("\n"); - sb.append("\t").append("boost : ").append(boost).append("\n"); - sb.append("\t").append("minTermFreq : ").append(minTermFreq).append("\n"); - sb.append("\t").append("minDocFreq : ").append(minDocFreq).append("\n"); - return sb.toString(); - } + // only really need 1st 3 entries, other ones are for troubleshooting + res.insertWithOverflow(new Object[]{word, // the word + topField, // the top field + score, // overall score + idf, // idf + docFreq, // freq in all docs + tf + }); + } + return res; + } - /** - * Find words for a more-like-this query former. - * - * @param docNum the id of the lucene document from which to find terms - */ - public PriorityQueue retrieveTerms(int docNum) throws IOException { - Map termFreqMap = new HashMap(); - for (String fieldName : fieldNames) { - final Fields vectors = ir.getTermVectors(docNum); - final Terms vector; - if (vectors != null) { - vector = vectors.terms(fieldName); - } else { - vector = null; - } + /** + * Describe the parameters that control how the "more like this" query is formed. 
+ */ + public String describeParams() { + StringBuilder sb = new StringBuilder(); + sb.append("\t").append("maxQueryTerms : ").append(maxQueryTerms).append("\n"); + sb.append("\t").append("minWordLen : ").append(minWordLen).append("\n"); + sb.append("\t").append("maxWordLen : ").append(maxWordLen).append("\n"); + sb.append("\t").append("fieldNames : "); + String delim = ""; + for (String fieldName : fieldNames) { + sb.append(delim).append(fieldName); + delim = ", "; + } + sb.append("\n"); + sb.append("\t").append("boost : ").append(boost).append("\n"); + sb.append("\t").append("minTermFreq : ").append(minTermFreq).append("\n"); + sb.append("\t").append("minDocFreq : ").append(minDocFreq).append("\n"); + return sb.toString(); + } - // field does not store term vector info - if (vector == null) { - StoredDocument d = ir.document(docNum); - StorableField[] fields = d.getFields(fieldName); - for (StorableField field : fields) { - final String stringValue = field.stringValue(); - if (stringValue != null) { - addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName); - } - } - } else { - addTermFrequencies(termFreqMap, vector); - } - } + /** + * Find words for a more-like-this query former. + * + * @param docNum the id of the lucene document from which to find terms + */ + public PriorityQueue retrieveTerms(int docNum) throws IOException { + Map termFreqMap = new HashMap(); + for (String fieldName : fieldNames) { + final Fields vectors = ir.getTermVectors(docNum); + final Terms vector; + if (vectors != null) { + vector = vectors.terms(fieldName); + } else { + vector = null; + } - return createQueue(termFreqMap); - } + // field does not store term vector info + if (vector == null) { + StoredDocument d = ir.document(docNum); + StorableField[] fields = d.getFields(fieldName); + for (StorableField field : fields) { + final String stringValue = field.stringValue(); + if (stringValue != null) { + addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName); + } + } + } else { + addTermFrequencies(termFreqMap, vector); + } + } - /** - * Adds terms and frequencies found in vector into the Map termFreqMap - * - * @param termFreqMap a Map of terms and their frequencies - * @param vector List of terms and their frequencies for a doc/field - */ - private void addTermFrequencies(Map termFreqMap, Terms vector) throws IOException { - final TermsEnum termsEnum = vector.iterator(null); - final CharsRef spare = new CharsRef(); - BytesRef text; - while((text = termsEnum.next()) != null) { - UnicodeUtil.UTF8toUTF16(text, spare); - final String term = spare.toString(); - if (isNoiseWord(term)) { - continue; - } - final int freq = (int) termsEnum.totalTermFreq(); + return createQueue(termFreqMap); + } - // increment frequency - Int cnt = termFreqMap.get(term); - if (cnt == null) { - cnt = new Int(); - termFreqMap.put(term, cnt); - cnt.x = freq; - } else { - cnt.x += freq; - } - } - } + /** + * Adds terms and frequencies found in vector into the Map termFreqMap + * + * @param termFreqMap a Map of terms and their frequencies + * @param vector List of terms and their frequencies for a doc/field + */ + private void addTermFrequencies(Map termFreqMap, Terms vector) throws IOException { + final TermsEnum termsEnum = vector.iterator(null); + final CharsRef spare = new CharsRef(); + BytesRef text; + while((text = termsEnum.next()) != null) { + UnicodeUtil.UTF8toUTF16(text, spare); + final String term = spare.toString(); + if (isNoiseWord(term)) { + continue; + } + final int freq = (int) 
termsEnum.totalTermFreq(); - /** - * Adds term frequencies found by tokenizing text from reader into the Map words - * - * @param r a source of text to be tokenized - * @param termFreqMap a Map of terms and their frequencies - * @param fieldName Used by analyzer for any special per-field analysis - */ - private void addTermFrequencies(Reader r, Map termFreqMap, String fieldName) - throws IOException { - if (analyzer == null) { - throw new UnsupportedOperationException("To use MoreLikeThis without " + - "term vectors, you must provide an Analyzer"); - } - TokenStream ts = analyzer.tokenStream(fieldName, r); - int tokenCount = 0; - // for every token - CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); - ts.reset(); - while (ts.incrementToken()) { - String word = termAtt.toString(); - tokenCount++; - if (tokenCount > maxNumTokensParsed) { - break; - } - if (isNoiseWord(word)) { - continue; - } + // increment frequency + Int cnt = termFreqMap.get(term); + if (cnt == null) { + cnt = new Int(); + termFreqMap.put(term, cnt); + cnt.x = freq; + } else { + cnt.x += freq; + } + } + } - // increment frequency - Int cnt = termFreqMap.get(word); - if (cnt == null) { - termFreqMap.put(word, new Int()); - } else { - cnt.x++; - } - } - ts.end(); - ts.close(); - } + /** + * Adds term frequencies found by tokenizing text from reader into the Map words + * + * @param r a source of text to be tokenized + * @param termFreqMap a Map of terms and their frequencies + * @param fieldName Used by analyzer for any special per-field analysis + */ + private void addTermFrequencies(Reader r, Map termFreqMap, String fieldName) + throws IOException { + if (analyzer == null) { + throw new UnsupportedOperationException("To use MoreLikeThis without " + + "term vectors, you must provide an Analyzer"); + } + TokenStream ts = analyzer.tokenStream(fieldName, r); + int tokenCount = 0; + // for every token + CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + ts.reset(); + while (ts.incrementToken()) { + String word = termAtt.toString(); + tokenCount++; + if (tokenCount > maxNumTokensParsed) { + break; + } + if (isNoiseWord(word)) { + continue; + } + // increment frequency + Int cnt = termFreqMap.get(word); + if (cnt == null) { + termFreqMap.put(word, new Int()); + } else { + cnt.x++; + } + } + ts.end(); + ts.close(); + } - /** - * determines if the passed term is likely to be of interest in "more like" comparisons - * - * @param term The word being considered - * @return true if should be ignored, false if should be used in further analysis - */ - private boolean isNoiseWord(String term) { - int len = term.length(); - if (minWordLen > 0 && len < minWordLen) { - return true; - } - if (maxWordLen > 0 && len > maxWordLen) { - return true; - } - return stopWords != null && stopWords.contains(term); - } + /** + * determines if the passed term is likely to be of interest in "more like" comparisons + * + * @param term The word being considered + * @return true if should be ignored, false if should be used in further analysis + */ + private boolean isNoiseWord(String term) { + int len = term.length(); + if (minWordLen > 0 && len < minWordLen) { + return true; + } + if (maxWordLen > 0 && len > maxWordLen) { + return true; + } + return stopWords != null && stopWords.contains(term); + } - /** - * Find words for a more-like-this query former. - * The result is a priority queue of arrays with one entry for every word in the document. - * Each array has 6 elements. - * The elements are: - *
    - *
  1. The word (String) - *
  2. The top field that this word comes from (String) - *
  3. The score for this word (Float) - *
  4. The IDF value (Float) - *
  5. The frequency of this word in the index (Integer) - *
  6. The frequency of this word in the source document (Integer) - *
- * This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest. - * This method is exposed so that you can identify the "interesting words" in a document. - * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}. - * - * @param r the reader that has the content of the document - * @param fieldName field passed to the analyzer to use when analyzing the content - * @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first - * @see #retrieveInterestingTerms - */ - public PriorityQueue retrieveTerms(Reader r, String fieldName) throws IOException { - Map words = new HashMap(); - addTermFrequencies(r, words, fieldName); - return createQueue(words); - } - /** - * @see #retrieveInterestingTerms(java.io.Reader, String) - */ - public String[] retrieveInterestingTerms(int docNum) throws IOException { - ArrayList al = new ArrayList(maxQueryTerms); - PriorityQueue pq = retrieveTerms(docNum); - Object cur; - int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... - // we just want to return the top words - while (((cur = pq.pop()) != null) && lim-- > 0) { - Object[] ar = (Object[]) cur; - al.add(ar[0]); // the 1st entry is the interesting word - } - String[] res = new String[al.size()]; - return al.toArray(res); - } + /** + * Find words for a more-like-this query former. + * The result is a priority queue of arrays with one entry for every word in the document. + * Each array has 6 elements. + * The elements are: + *
    + *
  1. The word (String) + *
  2. The top field that this word comes from (String) + *
  3. The score for this word (Float) + *
  4. The IDF value (Float) + *
  5. The frequency of this word in the index (Integer) + *
  6. The frequency of this word in the source document (Integer) + *
+ * This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest. + * This method is exposed so that you can identify the "interesting words" in a document. + * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}. + * + * @param r the reader that has the content of the document + * @param fieldName field passed to the analyzer to use when analyzing the content + * @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first + * @see #retrieveInterestingTerms + */ + public PriorityQueue retrieveTerms(Reader r, String fieldName) throws IOException { + Map words = new HashMap(); + addTermFrequencies(r, words, fieldName); + return createQueue(words); + } - /** - * Convenience routine to make it easy to return the most interesting words in a document. - * More advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly. - * - * @param r the source document - * @param fieldName field passed to analyzer to use when analyzing the content - * @return the most interesting words in the document - * @see #retrieveTerms(java.io.Reader, String) - * @see #setMaxQueryTerms - */ - public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException { - ArrayList al = new ArrayList(maxQueryTerms); - PriorityQueue pq = retrieveTerms(r, fieldName); - Object cur; - int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... - // we just want to return the top words - while (((cur = pq.pop()) != null) && lim-- > 0) { - Object[] ar = (Object[]) cur; - al.add(ar[0]); // the 1st entry is the interesting word - } - String[] res = new String[al.size()]; - return al.toArray(res); - } + /** + * @see #retrieveInterestingTerms(java.io.Reader, String) + */ + public String[] retrieveInterestingTerms(int docNum) throws IOException { + ArrayList al = new ArrayList(maxQueryTerms); + PriorityQueue pq = retrieveTerms(docNum); + Object cur; + int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... + // we just want to return the top words + while (((cur = pq.pop()) != null) && lim-- > 0) { + Object[] ar = (Object[]) cur; + al.add(ar[0]); // the 1st entry is the interesting word + } + String[] res = new String[al.size()]; + return al.toArray(res); + } - /** - * PriorityQueue that orders words by score. - */ - private static class FreqQ extends PriorityQueue { - FreqQ(int s) { - super(s); - } + /** + * Convenience routine to make it easy to return the most interesting words in a document. + * More advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly. + * + * @param r the source document + * @param fieldName field passed to analyzer to use when analyzing the content + * @return the most interesting words in the document + * @see #retrieveTerms(java.io.Reader, String) + * @see #setMaxQueryTerms + */ + public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException { + ArrayList al = new ArrayList(maxQueryTerms); + PriorityQueue pq = retrieveTerms(r, fieldName); + Object cur; + int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... 
+ // we just want to return the top words + while (((cur = pq.pop()) != null) && lim-- > 0) { + Object[] ar = (Object[]) cur; + al.add(ar[0]); // the 1st entry is the interesting word + } + String[] res = new String[al.size()]; + return al.toArray(res); + } - @Override - protected boolean lessThan(Object[] aa, Object[] bb) { - Float fa = (Float) aa[2]; - Float fb = (Float) bb[2]; - return fa > fb; - } - } + /** + * PriorityQueue that orders words by score. + */ + private static class FreqQ extends PriorityQueue { + FreqQ(int s) { + super(s); + } - /** - * Use for frequencies and to avoid renewing Integers. - */ - private static class Int { - int x; + @Override + protected boolean lessThan(Object[] aa, Object[] bb) { + Float fa = (Float) aa[2]; + Float fb = (Float) bb[2]; + return fa > fb; + } + } - Int() { - x = 1; - } - } + /** + * Use for frequencies and to avoid renewing Integers. + */ + private static class Int { + int x; + + Int() { + x = 1; + } + } }