Index: solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java =================================================================== --- solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java (revision 1147761) +++ solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java (working copy) @@ -367,7 +367,8 @@ public DocListAndSet getMoreLikeThis( Reader reader, int start, int rows, List filters, List terms, int flags ) throws IOException { - rawMLTQuery = mlt.like(reader); + // analyzing with the first field: previous (stupid) behavior + rawMLTQuery = mlt.like(reader, mlt.getFieldNames()[0]); boostedMLTQuery = getBoostedQuery( rawMLTQuery ); if( terms != null ) { fillInterestingTermsFromMLTQuery( boostedMLTQuery, terms ); Index: modules/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java =================================================================== --- modules/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java (revision 1147761) +++ modules/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java (working copy) @@ -87,7 +87,7 @@ mlt.setBoostFactor(boostFactor); BooleanQuery query = (BooleanQuery) mlt.like(new StringReader( - "lucene release")); + "lucene release"), "text"); List clauses = query.clauses(); assertEquals("Expected " + originalValues.size() + " clauses.", @@ -115,7 +115,7 @@ mlt.setFieldNames(new String[] {"text"}); mlt.setBoost(true); BooleanQuery query = (BooleanQuery) mlt.like(new StringReader( - "lucene release")); + "lucene release"), "text"); List clauses = query.clauses(); for (BooleanClause clause : clauses) { Index: modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java =================================================================== --- modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java (revision 1147768) +++ modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java (working copy) @@ -574,45 +574,12 @@ } /** - * Return a query that will return docs like the passed file. - * - * @return a query that will return docs like the passed file. - */ - public Query like(File f) throws IOException { - if (fieldNames == null) { - // gather list of valid fields from lucene - Collection fields = ir.getFieldNames(IndexReader.FieldOption.INDEXED); - fieldNames = fields.toArray(new String[fields.size()]); - } - - return like(new FileReader(f)); - } - - /** - * Return a query that will return docs like the passed URL. - * - * @return a query that will return docs like the passed URL. - */ - public Query like(URL u) throws IOException { - return like(new InputStreamReader(u.openConnection().getInputStream())); - } - - /** - * Return a query that will return docs like the passed stream. - * - * @return a query that will return docs like the passed stream. - */ - public Query like(java.io.InputStream is) throws IOException { - return like(new InputStreamReader(is)); - } - - /** * Return a query that will return docs like the passed Reader. * * @return a query that will return docs like the passed Reader. */ - public Query like(Reader r) throws IOException { - return createQuery(retrieveTerms(r)); + public Query like(Reader r, String fieldName) throws IOException { + return createQuery(retrieveTerms(r, fieldName)); } /** @@ -727,65 +694,6 @@ } /** - * Test driver. - * Pass in "-i INDEX" and then either "-fn FILE" or "-url URL". - */ - public static void main(String[] a) throws Throwable { - String indexName = "localhost_index"; - String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en"; - URL url = null; - for (int i = 0; i < a.length; i++) { - if (a[i].equals("-i")) { - indexName = a[++i]; - } else if (a[i].equals("-f")) { - fn = a[++i]; - } else if (a[i].equals("-url")) { - url = new URL(a[++i]); - } - } - - PrintStream o = System.out; - FSDirectory dir = FSDirectory.open(new File(indexName)); - IndexReader r = IndexReader.open(dir, true); - o.println("Open index " + indexName + " which has " + r.numDocs() + " docs"); - - MoreLikeThis mlt = new MoreLikeThis(r); - - o.println("Query generation parameters:"); - o.println(mlt.describeParams()); - o.println(); - - Query query = null; - if (url != null) { - o.println("Parsing URL: " + url); - query = mlt.like(url); - } else if (fn != null) { - o.println("Parsing file: " + fn); - query = mlt.like(new File(fn)); - } - - o.println("q: " + query); - o.println(); - IndexSearcher searcher = new IndexSearcher(dir, true); - - TopDocs hits = searcher.search(query, null, 25); - int len = hits.totalHits; - o.println("found: " + len + " documents matching"); - o.println(); - ScoreDoc[] scoreDocs = hits.scoreDocs; - for (int i = 0; i < Math.min(25, len); i++) { - Document d = searcher.doc(scoreDocs[i].doc); - String summary = d.get("summary"); - o.println("score : " + scoreDocs[i].score); - o.println("url : " + d.get("url")); - o.println("\ttitle : " + d.get("title")); - if (summary != null) - o.println("\tsummary: " + d.get("summary")); - o.println(); - } - } - - /** * Find words for a more-like-this query former. * * @param docNum the id of the lucene document from which to find terms @@ -918,14 +826,13 @@ * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}. * * @param r the reader that has the content of the document + * @param fieldName field passed to the analyzer to use when analyzing the content * @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first * @see #retrieveInterestingTerms */ - public PriorityQueue retrieveTerms(Reader r) throws IOException { + public PriorityQueue retrieveTerms(Reader r, String fieldName) throws IOException { Map words = new HashMap(); - for (String fieldName : fieldNames) { - addTermFrequencies(r, words, fieldName); - } + addTermFrequencies(r, words, fieldName); return createQueue(words); } @@ -948,16 +855,17 @@ /** * Convenience routine to make it easy to return the most interesting words in a document. - * More advanced users will call {@link #retrieveTerms(java.io.Reader) retrieveTerms()} directly. + * More advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly. * * @param r the source document + * @param fieldName field passed to analyzer to use when analyzing the content * @return the most interesting words in the document * @see #retrieveTerms(java.io.Reader) * @see #setMaxQueryTerms */ - public String[] retrieveInterestingTerms(Reader r) throws IOException { + public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException { ArrayList al = new ArrayList(maxQueryTerms); - PriorityQueue pq = retrieveTerms(r); + PriorityQueue pq = retrieveTerms(r, fieldName); Object cur; int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... // we just want to return the top words Index: modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThisQuery.java =================================================================== --- modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThisQuery.java (revision 1147768) +++ modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThisQuery.java (working copy) @@ -28,6 +28,7 @@ import java.io.ByteArrayInputStream; import java.io.IOException; +import java.io.StringReader; import java.util.Set; /** @@ -40,6 +41,7 @@ private String likeText; private String[] moreLikeFields; private Analyzer analyzer; + private String fieldName; private float percentTermsToMatch = 0.3f; private int minTermFrequency = 1; private int maxQueryTerms = 5; @@ -49,10 +51,11 @@ /** * @param moreLikeFields */ - public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer) { + public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer, String fieldName) { this.likeText = likeText; this.moreLikeFields = moreLikeFields; this.analyzer = analyzer; + this.fieldName = fieldName; } @Override @@ -67,7 +70,7 @@ } mlt.setMaxQueryTerms(maxQueryTerms); mlt.setStopWords(stopWords); - BooleanQuery bq = (BooleanQuery) mlt.like(new ByteArrayInputStream(likeText.getBytes())); + BooleanQuery bq = (BooleanQuery) mlt.like(new StringReader(likeText), fieldName); BooleanClause[] clauses = bq.getClauses(); //make at least half the terms match bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch)); Index: lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java =================================================================== --- lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java (revision 1147761) +++ lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java (working copy) @@ -96,7 +96,7 @@ } - MoreLikeThisQuery mlt=new MoreLikeThisQuery(DOMUtils.getText(e),fields,analyzer); + MoreLikeThisQuery mlt=new MoreLikeThisQuery(DOMUtils.getText(e),fields,analyzer, fields[0]); mlt.setMaxQueryTerms(DOMUtils.getAttribute(e,"maxQueryTerms",defaultMaxQueryTerms)); mlt.setMinTermFrequency(DOMUtils.getAttribute(e,"minTermFrequency",defaultMinTermFrequency)); mlt.setPercentTermsToMatch(DOMUtils.getAttribute(e,"percentTermsToMatch",defaultPercentTermsToMatch)/100);