Index: lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java =================================================================== --- lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java (revision 1401723) +++ lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java (working copy) @@ -15,23 +15,30 @@ */ package org.apache.lucene.queries.mlt; -import java.io.*; -import java.util.*; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.document.Document; import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.StorableField; import org.apache.lucene.index.StoredDocument; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.*; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.search.similarities.TFIDFSimilarity; import org.apache.lucene.util.BytesRef; @@ -135,787 +142,899 @@ * - refactor: moved common code into isNoiseWord() * - optimise: when no termvector support available - used maxNumTermsParsed to limit amount of tokenization * + * + *
+ * Changes: Ying Andrews 23/10/12 + * Added support for multiple index readers so More Like This can generate a similarity query based on multiple indexes. + * This extends the MoreLikeThis feature to work with the Lucene MultiSearcher. + * + * For example: + * Due to its large size we may want to divide the sales index into: sales_1, sales_2, sales_3, ..., sales_n. + * In this case we would best use a parallel multi-searcher to do the search. The old MoreLikeThis.java doesn't support + * this scenario. If the current document of interest comes from index sales_1, then the query returned from + * like(int) and like(Reader, String) will only be based on index sales_1, which clearly does not reflect the + * entirety of the whole document population. + * + * Modified: + * constructors - MoreLikeThis(IndexReader), + * MoreLikeThis(IndexReader, TFIDFSimilarity) + * private method - createQueue(Map*/ public final class MoreLikeThis { - /** - * Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support. - * - * @see #getMaxNumTokensParsed - */ - public static final int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000; + /** + * Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support. + * + * @see #getMaxNumTokensParsed + */ + public static final int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000; + + /** + * Ignore terms with less than this frequency in the source doc. + * + * @see #getMinTermFreq + * @see #setMinTermFreq + */ + public static final int DEFAULT_MIN_TERM_FREQ = 2; + + /** + * Ignore words which do not occur in at least this many docs. + * + * @see #getMinDocFreq + * @see #setMinDocFreq + */ + public static final int DEFAULT_MIN_DOC_FREQ = 5; + + /** + * Ignore words which occur in more than this many docs.
+ * + * @see #getMaxDocFreq + * @see #setMaxDocFreq + * @see #setMaxDocFreqPct + */ + public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE; + + /** + * Boost terms in query based on score. + * + * @see #isBoost + * @see #setBoost + */ + public static final boolean DEFAULT_BOOST = false; + + /** + * Default field names. Null is used to specify that the field names should be looked + * up at runtime from the provided reader. + */ + public static final String[] DEFAULT_FIELD_NAMES = new String[]{"contents"}; - /** - * Ignore terms with less than this frequency in the source doc. - * - * @see #getMinTermFreq - * @see #setMinTermFreq - */ - public static final int DEFAULT_MIN_TERM_FREQ = 2; + /** + * Ignore words less than this length or if 0 then this has no effect. + * + * @see #getMinWordLen + * @see #setMinWordLen + */ + public static final int DEFAULT_MIN_WORD_LENGTH = 0; - /** - * Ignore words which do not occur in at least this many docs. - * - * @see #getMinDocFreq - * @see #setMinDocFreq - */ - public static final int DEFAULT_MIN_DOC_FREQ = 5; + /** + * Ignore words greater than this length or if 0 then this has no effect. + * + * @see #getMaxWordLen + * @see #setMaxWordLen + */ + public static final int DEFAULT_MAX_WORD_LENGTH = 0; - /** - * Ignore words which occur in more than this many docs. - * - * @see #getMaxDocFreq - * @see #setMaxDocFreq - * @see #setMaxDocFreqPct - */ - public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE; + /** + * Default set of stopwords. + * If null means to allow stop words. + * + * @see #setStopWords + * @see #getStopWords + */ + public static final Set> DEFAULT_STOP_WORDS = null; - /** - * Boost terms in query based on score. - * - * @see #isBoost - * @see #setBoost - */ - public static final boolean DEFAULT_BOOST = false; + /** + * Current set of stop words. + */ + private Set> stopWords = DEFAULT_STOP_WORDS; - /** - * Default field names. 
Null is used to specify that the field names should be looked - * up at runtime from the provided reader. - */ - public static final String[] DEFAULT_FIELD_NAMES = new String[]{"contents"}; + /** + * Return a Query with no more than this many terms. + * + * @see BooleanQuery#getMaxClauseCount + * @see #getMaxQueryTerms + * @see #setMaxQueryTerms + */ + public static final int DEFAULT_MAX_QUERY_TERMS = 25; - /** - * Ignore words less than this length or if 0 then this has no effect. - * - * @see #getMinWordLen - * @see #setMinWordLen - */ - public static final int DEFAULT_MIN_WORD_LENGTH = 0; + /** + * Analyzer that will be used to parse the doc. + */ + private Analyzer analyzer = null; - /** - * Ignore words greater than this length or if 0 then this has no effect. - * - * @see #getMaxWordLen - * @see #setMaxWordLen - */ - public static final int DEFAULT_MAX_WORD_LENGTH = 0; + /** + * Ignore words less frequent that this. + */ + private int minTermFreq = DEFAULT_MIN_TERM_FREQ; - /** - * Default set of stopwords. - * If null means to allow stop words. - * - * @see #setStopWords - * @see #getStopWords - */ - public static final Set> DEFAULT_STOP_WORDS = null; + /** + * Ignore words which do not occur in at least this many docs. + */ + private int minDocFreq = DEFAULT_MIN_DOC_FREQ; - /** - * Current set of stop words. - */ - private Set> stopWords = DEFAULT_STOP_WORDS; + /** + * Ignore words which occur in more than this many docs. + */ + private int maxDocFreq = DEFAULT_MAX_DOC_FREQ; - /** - * Return a Query with no more than this many terms. - * - * @see BooleanQuery#getMaxClauseCount - * @see #getMaxQueryTerms - * @see #setMaxQueryTerms - */ - public static final int DEFAULT_MAX_QUERY_TERMS = 25; + /** + * Should we apply a boost to the Query based on the scores? + */ + private boolean boost = DEFAULT_BOOST; - /** - * Analyzer that will be used to parse the doc. - */ - private Analyzer analyzer = null; + /** + * Field name we'll analyze. 
+ */ + private String[] fieldNames = DEFAULT_FIELD_NAMES; - /** - * Ignore words less frequent that this. - */ - private int minTermFreq = DEFAULT_MIN_TERM_FREQ; + /** + * The maximum number of tokens to parse in each example doc field that is not stored with TermVector support + */ + private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED; - /** - * Ignore words which do not occur in at least this many docs. - */ - private int minDocFreq = DEFAULT_MIN_DOC_FREQ; + /** + * Ignore words if less than this len. + */ + private int minWordLen = DEFAULT_MIN_WORD_LENGTH; - /** - * Ignore words which occur in more than this many docs. - */ - private int maxDocFreq = DEFAULT_MAX_DOC_FREQ; + /** + * Ignore words if greater than this len. + */ + private int maxWordLen = DEFAULT_MAX_WORD_LENGTH; - /** - * Should we apply a boost to the Query based on the scores? - */ - private boolean boost = DEFAULT_BOOST; + /** + * Don't return a query longer than this. + */ + private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS; - /** - * Field name we'll analyze. - */ - private String[] fieldNames = DEFAULT_FIELD_NAMES; + /** + * For idf() calculations. + */ + private TFIDFSimilarity similarity;// = new DefaultSimilarity(); - /** - * The maximum number of tokens to parse in each example doc field that is not stored with TermVector support - */ - private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED; + /** + * IndexReader to use + */ + private final IndexReader ir; - /** - * Ignore words if less than this len. - */ - private int minWordLen = DEFAULT_MIN_WORD_LENGTH; + /** + * IndexReader array to use when multi-searchers are used. + */ + private final IndexReader [] irArray; - /** - * Ignore words if greater than this len. - */ - private int maxWordLen = DEFAULT_MAX_WORD_LENGTH; + /** + * Boost factor to use when boosting the terms + */ + private float boostFactor = 1; - /** - * Don't return a query longer than this. 
- */ - private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS; + /** + * Returns the boost factor used when boosting terms + * + * @return the boost factor used when boosting terms + * @see #setBoostFactor(float) + */ + public float getBoostFactor() { + return boostFactor; + } - /** - * For idf() calculations. - */ - private TFIDFSimilarity similarity;// = new DefaultSimilarity(); + /** + * Sets the boost factor to use when boosting terms + * + * @see #getBoostFactor() + */ + public void setBoostFactor(float boostFactor) { + this.boostFactor = boostFactor; + } - /** - * IndexReader to use - */ - private final IndexReader ir; + // /** + // * Constructor requiring an IndexReader. + // */ + // public MoreLikeThis(IndexReader ir) { + // this(ir, new DefaultSimilarity()); + // } + // + // public MoreLikeThis(IndexReader ir, TFIDFSimilarity sim) { + // this.ir = ir; + // this.similarity = sim; + // } - /** - * Boost factor to use when boosting the terms - */ - private float boostFactor = 1; + /** + * Constructor requiring an IndexReader. + * @param ir A single IndexReader from which the term frequencies and inverse document frequencies are taken + */ + public MoreLikeThis(IndexReader ir) { + this(ir, null, new DefaultSimilarity()); + } - /** - * Returns the boost factor used when boosting terms - * - * @return the boost factor used when boosting terms - * @see #setBoostFactor(float) - */ - public float getBoostFactor() { - return boostFactor; - } + /** + * Constructor requiring an IndexReader and similarity. + * @param ir A single IndexReader from which the term frequencies and inverse document frequencies are taken + * @param sim A similarity object that contains scoring components + */ + public MoreLikeThis(IndexReader ir, TFIDFSimilarity sim){ + this(ir,null,sim); + } + /** + * Constructor requiring an array of IndexReaders to support multi-searcher, so we can + * get the correct Inverse Document Frequencies. + * + * @param ir The index reader that the current document of interest comes from.
+ * "More Like This" tries to find similar documents to this one. + * @param irArray An array of IndexReaders that "More Like This" will use to + * look for similar documents to the one that comes from argument "ir". + * This array may contain the index reader "ir" from which + * the current document of interest comes. + * + */ + public MoreLikeThis(IndexReader ir, IndexReader[] irArray) { + this(ir, irArray, new DefaultSimilarity()); + } - /** - * Sets the boost factor to use when boosting terms - * - * @see #getBoostFactor() - */ - public void setBoostFactor(float boostFactor) { - this.boostFactor = boostFactor; - } + /** + * Constructor requiring an IndexReader, an array of IndexReaders and a similarity. The array supports multi-searcher, so we can + * get the correct Inverse Document Frequencies. + * + * @param ir The index reader that the current document of interest comes from; the array of IndexReaders to get the total number of documents from is passed as irArray. + */ + public MoreLikeThis(IndexReader ir, IndexReader [] irArray, TFIDFSimilarity sim){ + this.irArray = irArray; + this.ir=ir; + this.similarity = sim; + } - /** - * Constructor requiring an IndexReader. - */ - public MoreLikeThis(IndexReader ir) { - this(ir, new DefaultSimilarity()); - } - public MoreLikeThis(IndexReader ir, TFIDFSimilarity sim) { - this.ir = ir; - this.similarity = sim; - } + public TFIDFSimilarity getSimilarity() { + return similarity; + } + public void setSimilarity(TFIDFSimilarity similarity) { + this.similarity = similarity; + } - public TFIDFSimilarity getSimilarity() { - return similarity; - } + /** + * Returns an analyzer that will be used to parse source doc with. The default analyzer + * is not set. + * + * @return the analyzer that will be used to parse source doc with. + */ + public Analyzer getAnalyzer() { + return analyzer; + } - public void setSimilarity(TFIDFSimilarity similarity) { - this.similarity = similarity; - } + /** + * Sets the analyzer to use. An analyzer is not required for generating a query with the + * {@link #like(int)} method, all other 'like' methods require an analyzer.
+ * + * @param analyzer the analyzer to use to tokenize text. + */ + public void setAnalyzer(Analyzer analyzer) { + this.analyzer = analyzer; + } - /** - * Returns an analyzer that will be used to parse source doc with. The default analyzer - * is not set. - * - * @return the analyzer that will be used to parse source doc with. - */ - public Analyzer getAnalyzer() { - return analyzer; - } + /** + * Returns the frequency below which terms will be ignored in the source doc. The default + * frequency is the {@link #DEFAULT_MIN_TERM_FREQ}. + * + * @return the frequency below which terms will be ignored in the source doc. + */ + public int getMinTermFreq() { + return minTermFreq; + } - /** - * Sets the analyzer to use. An analyzer is not required for generating a query with the - * {@link #like(int)} method, all other 'like' methods require an analyzer. - * - * @param analyzer the analyzer to use to tokenize text. - */ - public void setAnalyzer(Analyzer analyzer) { - this.analyzer = analyzer; - } + /** + * Sets the frequency below which terms will be ignored in the source doc. + * + * @param minTermFreq the frequency below which terms will be ignored in the source doc. + */ + public void setMinTermFreq(int minTermFreq) { + this.minTermFreq = minTermFreq; + } - /** - * Returns the frequency below which terms will be ignored in the source doc. The default - * frequency is the {@link #DEFAULT_MIN_TERM_FREQ}. - * - * @return the frequency below which terms will be ignored in the source doc. - */ - public int getMinTermFreq() { - return minTermFreq; - } + /** + * Returns the frequency at which words will be ignored which do not occur in at least this + * many docs. The default frequency is {@link #DEFAULT_MIN_DOC_FREQ}. + * + * @return the frequency at which words will be ignored which do not occur in at least this + * many docs. + */ + public int getMinDocFreq() { + return minDocFreq; + } - /** - * Sets the frequency below which terms will be ignored in the source doc. 
- * - * @param minTermFreq the frequency below which terms will be ignored in the source doc. - */ - public void setMinTermFreq(int minTermFreq) { - this.minTermFreq = minTermFreq; - } + /** + * Sets the frequency at which words will be ignored which do not occur in at least this + * many docs. + * + * @param minDocFreq the frequency at which words will be ignored which do not occur in at + * least this many docs. + */ + public void setMinDocFreq(int minDocFreq) { + this.minDocFreq = minDocFreq; + } - /** - * Returns the frequency at which words will be ignored which do not occur in at least this - * many docs. The default frequency is {@link #DEFAULT_MIN_DOC_FREQ}. - * - * @return the frequency at which words will be ignored which do not occur in at least this - * many docs. - */ - public int getMinDocFreq() { - return minDocFreq; - } + /** + * Returns the maximum frequency in which words may still appear. + * Words that appear in more than this many docs will be ignored. The default frequency is + * {@link #DEFAULT_MAX_DOC_FREQ}. + * + * @return get the maximum frequency at which words are still allowed, + * words which occur in more docs than this are ignored. + */ + public int getMaxDocFreq() { + return maxDocFreq; + } - /** - * Sets the frequency at which words will be ignored which do not occur in at least this - * many docs. - * - * @param minDocFreq the frequency at which words will be ignored which do not occur in at - * least this many docs. - */ - public void setMinDocFreq(int minDocFreq) { - this.minDocFreq = minDocFreq; - } + /** + * Set the maximum frequency in which words may still appear. Words that appear + * in more than this many docs will be ignored. + * + * @param maxFreq the maximum count of documents that a term may appear + * in to be still considered relevant + */ + public void setMaxDocFreq(int maxFreq) { + this.maxDocFreq = maxFreq; + } - /** - * Returns the maximum frequency in which words may still appear. 
- * Words that appear in more than this many docs will be ignored. The default frequency is - * {@link #DEFAULT_MAX_DOC_FREQ}. - * - * @return get the maximum frequency at which words are still allowed, - * words which occur in more docs than this are ignored. - */ - public int getMaxDocFreq() { - return maxDocFreq; - } + /** + * Set the maximum percentage in which words may still appear. Words that appear + * in more than this many percent of all docs will be ignored. + * + * @param maxPercentage the maximum percentage of documents (0-100) that a term may appear + * in to be still considered relevant + */ + public void setMaxDocFreqPct(int maxPercentage) { + this.maxDocFreq = maxPercentage * ir.numDocs() / 100; + } - /** - * Set the maximum frequency in which words may still appear. Words that appear - * in more than this many docs will be ignored. - * - * @param maxFreq the maximum count of documents that a term may appear - * in to be still considered relevant - */ - public void setMaxDocFreq(int maxFreq) { - this.maxDocFreq = maxFreq; - } - /** - * Set the maximum percentage in which words may still appear. Words that appear - * in more than this many percent of all docs will be ignored. - * - * @param maxPercentage the maximum percentage of documents (0-100) that a term may appear - * in to be still considered relevant - */ - public void setMaxDocFreqPct(int maxPercentage) { - this.maxDocFreq = maxPercentage * ir.numDocs() / 100; - } + /** + * Returns whether to boost terms in query based on "score" or not. The default is + * {@link #DEFAULT_BOOST}. + * + * @return whether to boost terms in query based on "score" or not. + * @see #setBoost + */ + public boolean isBoost() { + return boost; + } + /** + * Sets whether to boost terms in query based on "score" or not. + * + * @param boost true to boost terms in query based on "score", false otherwise. 
+ * @see #isBoost + */ + public void setBoost(boolean boost) { + this.boost = boost; + } - /** - * Returns whether to boost terms in query based on "score" or not. The default is - * {@link #DEFAULT_BOOST}. - * - * @return whether to boost terms in query based on "score" or not. - * @see #setBoost - */ - public boolean isBoost() { - return boost; - } + /** + * Returns the field names that will be used when generating the 'More Like This' query. + * The default field names that will be used is {@link #DEFAULT_FIELD_NAMES}. + * + * @return the field names that will be used when generating the 'More Like This' query. + */ + public String[] getFieldNames() { + return fieldNames; + } - /** - * Sets whether to boost terms in query based on "score" or not. - * - * @param boost true to boost terms in query based on "score", false otherwise. - * @see #isBoost - */ - public void setBoost(boolean boost) { - this.boost = boost; - } + /** + * Sets the field names that will be used when generating the 'More Like This' query. + * Set this to null for the field names to be determined at runtime from the IndexReader + * provided in the constructor. + * + * @param fieldNames the field names that will be used when generating the 'More Like This' + * query. + */ + public void setFieldNames(String[] fieldNames) { + this.fieldNames = fieldNames; + } - /** - * Returns the field names that will be used when generating the 'More Like This' query. - * The default field names that will be used is {@link #DEFAULT_FIELD_NAMES}. - * - * @return the field names that will be used when generating the 'More Like This' query. - */ - public String[] getFieldNames() { - return fieldNames; - } + /** + * Returns the minimum word length below which words will be ignored. Set this to 0 for no + * minimum word length. The default is {@link #DEFAULT_MIN_WORD_LENGTH}. + * + * @return the minimum word length below which words will be ignored. 
+ */ + public int getMinWordLen() { + return minWordLen; + } - /** - * Sets the field names that will be used when generating the 'More Like This' query. - * Set this to null for the field names to be determined at runtime from the IndexReader - * provided in the constructor. - * - * @param fieldNames the field names that will be used when generating the 'More Like This' - * query. - */ - public void setFieldNames(String[] fieldNames) { - this.fieldNames = fieldNames; - } + /** + * Sets the minimum word length below which words will be ignored. + * + * @param minWordLen the minimum word length below which words will be ignored. + */ + public void setMinWordLen(int minWordLen) { + this.minWordLen = minWordLen; + } - /** - * Returns the minimum word length below which words will be ignored. Set this to 0 for no - * minimum word length. The default is {@link #DEFAULT_MIN_WORD_LENGTH}. - * - * @return the minimum word length below which words will be ignored. - */ - public int getMinWordLen() { - return minWordLen; - } + /** + * Returns the maximum word length above which words will be ignored. Set this to 0 for no + * maximum word length. The default is {@link #DEFAULT_MAX_WORD_LENGTH}. + * + * @return the maximum word length above which words will be ignored. + */ + public int getMaxWordLen() { + return maxWordLen; + } - /** - * Sets the minimum word length below which words will be ignored. - * - * @param minWordLen the minimum word length below which words will be ignored. - */ - public void setMinWordLen(int minWordLen) { - this.minWordLen = minWordLen; - } + /** + * Sets the maximum word length above which words will be ignored. + * + * @param maxWordLen the maximum word length above which words will be ignored. + */ + public void setMaxWordLen(int maxWordLen) { + this.maxWordLen = maxWordLen; + } - /** - * Returns the maximum word length above which words will be ignored. Set this to 0 for no - * maximum word length. 
The default is {@link #DEFAULT_MAX_WORD_LENGTH}. - * - * @return the maximum word length above which words will be ignored. - */ - public int getMaxWordLen() { - return maxWordLen; - } + /** + * Set the set of stopwords. + * Any word in this set is considered "uninteresting" and ignored. + * Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as + * for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting". + * + * @param stopWords set of stopwords, if null it means to allow stop words + * @see #getStopWords + */ + public void setStopWords(Set> stopWords) { + this.stopWords = stopWords; + } - /** - * Sets the maximum word length above which words will be ignored. - * - * @param maxWordLen the maximum word length above which words will be ignored. - */ - public void setMaxWordLen(int maxWordLen) { - this.maxWordLen = maxWordLen; - } + /** + * Get the current stop words being used. + * + * @see #setStopWords + */ + public Set> getStopWords() { + return stopWords; + } - /** - * Set the set of stopwords. - * Any word in this set is considered "uninteresting" and ignored. - * Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as - * for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting". - * - * @param stopWords set of stopwords, if null it means to allow stop words - * @see #getStopWords - */ - public void setStopWords(Set> stopWords) { - this.stopWords = stopWords; - } - /** - * Get the current stop words being used. - * - * @see #setStopWords - */ - public Set> getStopWords() { - return stopWords; - } + /** + * Returns the maximum number of query terms that will be included in any generated query. + * The default is {@link #DEFAULT_MAX_QUERY_TERMS}. + * + * @return the maximum number of query terms that will be included in any generated query. 
+ */ + public int getMaxQueryTerms() { + return maxQueryTerms; + } + /** + * Sets the maximum number of query terms that will be included in any generated query. + * + * @param maxQueryTerms the maximum number of query terms that will be included in any + * generated query. + */ + public void setMaxQueryTerms(int maxQueryTerms) { + this.maxQueryTerms = maxQueryTerms; + } - /** - * Returns the maximum number of query terms that will be included in any generated query. - * The default is {@link #DEFAULT_MAX_QUERY_TERMS}. - * - * @return the maximum number of query terms that will be included in any generated query. - */ - public int getMaxQueryTerms() { - return maxQueryTerms; - } + /** + * @return The maximum number of tokens to parse in each example doc field that is not stored with TermVector support + * @see #DEFAULT_MAX_NUM_TOKENS_PARSED + */ + public int getMaxNumTokensParsed() { + return maxNumTokensParsed; + } - /** - * Sets the maximum number of query terms that will be included in any generated query. - * - * @param maxQueryTerms the maximum number of query terms that will be included in any - * generated query. - */ - public void setMaxQueryTerms(int maxQueryTerms) { - this.maxQueryTerms = maxQueryTerms; - } + /** + * @param i The maximum number of tokens to parse in each example doc field that is not stored with TermVector support + */ + public void setMaxNumTokensParsed(int i) { + maxNumTokensParsed = i; + } - /** - * @return The maximum number of tokens to parse in each example doc field that is not stored with TermVector support - * @see #DEFAULT_MAX_NUM_TOKENS_PARSED - */ - public int getMaxNumTokensParsed() { - return maxNumTokensParsed; - } - /** - * @param i The maximum number of tokens to parse in each example doc field that is not stored with TermVector support - */ - public void setMaxNumTokensParsed(int i) { - maxNumTokensParsed = i; - } + /** + * Return a query that will return docs like the passed lucene document ID. 
+ * + * @param docNum the documentID of the lucene doc to generate the 'More Like This" query for. + * @return a query that will return docs like the passed lucene document ID. + */ + public Query like(int docNum) throws IOException { + if (fieldNames == null) { + // gather list of valid fields from lucene + Collection) + * + * Added: + * constructors - MoreLikeThis(IndexReader, IndexReader[]), + * MoreLikeThis(IndexReader, IndexReader[], Similarity) + * + * Notes: + * When invoking method like(int) of this class, you have to pass in the NORMALIZED document number. + * You can use the same algorithm used in lucene MultiSearcher class, specifically seen in + * subSearcher(int) and subDoc(int) methods. + *