Index: solr/contrib/classification/ivy.xml =================================================================== --- solr/contrib/classification/ivy.xml (revision 0) +++ solr/contrib/classification/ivy.xml (revision 0) @@ -0,0 +1,24 @@ + + + + + + + Index: solr/contrib/classification/src/test/org/apache/solr/handler/classification/ClassifierRequestHandlerTest.java =================================================================== --- solr/contrib/classification/src/test/org/apache/solr/handler/classification/ClassifierRequestHandlerTest.java (revision 0) +++ solr/contrib/classification/src/test/org/apache/solr/handler/classification/ClassifierRequestHandlerTest.java (revision 0) @@ -0,0 +1,81 @@ +package org.apache.solr.handler.classification; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+
+/**
+ * Tests {@link ClassifierRequestHandler} by indexing a handful of labelled documents and
+ * asking the handler queried as "nb" to classify a text that was not indexed.
+ */
+public class ClassifierRequestHandlerTest extends SolrTestCaseJ4 {
+
+  private static final String ID_FIELD = "id";
+  private static final String TEXT_FIELD = "text";
+  private static final String CLASS_FIELD = "cat";
+
+  /** Starts the test core from the Naive Bayes specific config and schema files. */
+  @BeforeClass
+  public static void nbBeforeClass() throws Exception {
+    initCore("solrconfig_nb.xml", "schema_nb.xml", "classification/solr");
+  }
+
+  /**
+   * Indexes four "politics" and three "technology" documents, then expects a text that was
+   * not indexed to come back from the handler labelled "technology".
+   */
+  @Test
+  public void testNBBasicUsage() throws Exception {
+    indexSample("1",
+        "The traveling press secretary for Mitt Romney lost his cool and cursed at reporters " +
+        "who attempted to ask questions of the Republican presidential candidate in a public plaza near the Tomb of " +
+        "the Unknown Soldier in Warsaw Tuesday.",
+        "politics");
+    indexSample("2",
+        " Mitt Romney seeks to assure Israel and Iran, as well as Jewish voters in the United" +
+        " States, that he will be tougher against Iran's nuclear ambitions than President Barack Obama.",
+        "politics");
+    indexSample("3",
+        "And there's a threshold question that he has to answer for the American people and " +
+        "that's whether he is prepared to be commander-in-chief,\" she continued. \"As we look to the past events, we " +
+        "know that this raises some questions about his preparedness and we'll see how the rest of his trip goes.\"",
+        "politics");
+    indexSample("4",
+        "Still, when it comes to gun policy, many congressional Democrats have \"decided to " +
+        "keep quiet and not go there,\" said Alan Lizotte, dean and professor at the State University of New York at " +
+        "Albany's School of Criminal Justice.",
+        "politics");
+    indexSample("5",
+        "Standing amongst the thousands of people at the state Capitol, Jorstad, director of " +
+        "technology at the University of Wisconsin-La Crosse, documented the historic moment and shared it with the " +
+        "world through the Internet.",
+        "technology");
+    indexSample("6",
+        "So, about all those experts and analysts who've spent the past year or so saying " +
+        "Facebook was going to make a phone. A new expert has stepped forward to say it's not going to happen.",
+        "technology");
+    indexSample("7",
+        "More than 400 million people trust Google with their e-mail, and 50 million store files" +
+        " in the cloud using the Dropbox service. People manage their bank accounts, pay bills, trade stocks and " +
+        "generally transfer or store huge volumes of personal data online.",
+        "technology");
+    assertU(commit());
+
+    // sanity check of the training set before classifying anything
+    assertQ(req("*:*"), "//*[@numFound='7']");
+    assertQ(req("cat:politics"), "//*[@numFound='4']");
+
+    String unseenText = "Much is made of what the likes of Facebook, Google and Apple know about users. Truth is, Amazon may know more. ";
+
+    SolrQueryRequest classifyRequest =
+        lrf.makeRequest("textfield", TEXT_FIELD, "classfield", CLASS_FIELD, TEXT_FIELD, unseenText);
+    try {
+      SolrQueryResponse classifyResponse = h.queryAndResponse("nb", classifyRequest);
+      Object assignedClass = classifyResponse.getValues().get("found-class");
+      assertNotNull(assignedClass);
+      assertEquals("technology", assignedClass);
+    } finally {
+      // the request must be released whether or not the assertions passed
+      classifyRequest.close();
+    }
+  }
+
+  /** Adds one labelled document to the index; the caller is responsible for committing. */
+  private void indexSample(String id, String text, String category) {
+    assertU(adoc(ID_FIELD, id, TEXT_FIELD, text, CLASS_FIELD, category));
+  }
+}
Index: solr/contrib/classification/src/test-files/classification/solr/collection1/conf/solrconfig_nb.xml
===================================================================
--- solr/contrib/classification/src/test-files/classification/solr/collection1/conf/solrconfig_nb.xml (revision 0)
+++ solr/contrib/classification/src/test-files/classification/solr/collection1/conf/solrconfig_nb.xml (revision 0)
@@ -0,0 +1,61 @@
+
+
+
+
+ ${tests.luceneMatchVersion:LUCENE_CURRENT}
+
+ ${solr.data.dir:}
+
+
+
+
+
+
+
+
+
+ true
+ 20
+ 200
+ false
+ 2
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Property changes on: solr/contrib/classification/src/test-files/classification/solr/collection1/conf/solrconfig_nb.xml
___________________________________________________________________
Added: svn:executable
   + *
Index: solr/contrib/classification/src/test-files/classification/solr/collection1/conf/schema_nb.xml
===================================================================
--- solr/contrib/classification/src/test-files/classification/solr/collection1/conf/schema_nb.xml (revision 0)
+++ solr/contrib/classification/src/test-files/classification/solr/collection1/conf/schema_nb.xml (revision 0)
@@ -0,0 +1,34 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: solr/contrib/classification/src/java/org/apache/solr/handler/classification/Classifier.java
===================================================================
--- solr/contrib/classification/src/java/org/apache/solr/handler/classification/Classifier.java (revision 0)
+++ solr/contrib/classification/src/java/org/apache/solr/handler/classification/Classifier.java
(revision 0)
@@ -0,0 +1,50 @@
+package org.apache.solr.handler.classification;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.search.IndexSearcher;
+
+/**
+ * Contract for a classifier that is first trained from an index and then asked
+ * to assign a class to a piece of text.
+ */
+public interface Classifier {
+
+  /**
+   * Trains the classifier from the index behind the given searcher.
+   *
+   * @param indexSearcher searcher giving access to the documents to learn from
+   * @param textFieldName name of the field that holds the text of a document
+   * @param classFieldName name of the field that holds the class of a document
+   * @param analyzer analyzer the implementation may use to tokenize text
+   * @throws ClassificationException if training fails
+   */
+  void train(IndexSearcher indexSearcher, String textFieldName, String classFieldName, Analyzer analyzer)
+      throws ClassificationException;
+
+  /**
+   * Assigns a class to the given text.
+   *
+   * @param text the text to classify
+   * @return the class assigned to the text
+   * @throws ClassificationException if the class cannot be computed
+   */
+  String assignClass(String text) throws ClassificationException;
+
+}
Index: solr/contrib/classification/src/java/org/apache/solr/handler/classification/SimpleNaiveBayesClassifier.java
===================================================================
--- solr/contrib/classification/src/java/org/apache/solr/handler/classification/SimpleNaiveBayesClassifier.java (revision 0)
+++ solr/contrib/classification/src/java/org/apache/solr/handler/classification/SimpleNaiveBayesClassifier.java (revision 0)
@@ -0,0 +1,188 @@
+package org.apache.solr.handler.classification;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.util.Bits;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * A Lucene based NaiveBayes classifier.
+ * <p>
+ * {@code train} collects the classes from the stored values of the class field and
+ * pre-computes one prior per class. {@code assignClass} analyzes the input text once, scores
+ * every known class and returns the best one. Scores are sums of logarithms rather than
+ * products of probabilities, so a long text cannot underflow every score to zero.
+ * <p>
+ * Instances keep the state of the last training run in plain fields and do no locking.
+ */
+public class SimpleNaiveBayesClassifier implements Classifier {
+
+  /** Prior P(c) of every known class; {@code null} until a training run has completed. */
+  private Map<String, Double> priors;
+
+  private IndexSearcher indexSearcher;
+  private String textFieldName;
+  private String classFieldName;
+  /** Number of live documents found for each stored value of the class field. */
+  private Map<String, Integer> classCounts;
+  /** Number of live documents that have a value for the class field. */
+  private int docsWithClassSize;
+  private Analyzer analyzer;
+
+  /**
+   * Learns the classes and their priors from the given index.
+   *
+   * @param indexSearcher searcher over the documents to learn from
+   * @param textFieldName field searched for words and used to pick the analysis chain
+   * @param classFieldName stored field whose value is the class of a document
+   * @param analyzer analyzer used to tokenize the text passed to {@code assignClass}
+   * @throws ClassificationException if the index cannot be read
+   */
+  @Override
+  public void train(IndexSearcher indexSearcher, String textFieldName, String classFieldName, Analyzer analyzer)
+      throws ClassificationException {
+    // drop the previous model first: if this run fails, assignClass must not mix old and new state
+    this.priors = null;
+    this.classCounts = null;
+    this.indexSearcher = indexSearcher;
+    this.textFieldName = textFieldName;
+    this.classFieldName = classFieldName;
+    this.analyzer = analyzer;
+    try {
+      createVocabulary();
+      preComputePriors();
+    } catch (IOException e) {
+      throw new ClassificationException(e);
+    }
+  }
+
+
+  /** Computes the prior of every class and publishes the map only once it is complete. */
+  private void preComputePriors() throws IOException {
+    Map<String, Double> computed = new HashMap<String, Double>();
+    for (String cl : classCounts.keySet()) {
+      computed.put(cl, calculatePrior(cl));
+    }
+    priors = computed;
+  }
+
+  /** Scans the live documents and records which classes exist and how many documents are labelled. */
+  private void createVocabulary() throws IOException {
+    Map<String, Integer> counts = new HashMap<String, Integer>();
+    int labelledDocs = 0;
+    IndexReader indexReader = indexSearcher.getIndexReader();
+    // document(int) does not check for deletions, so deleted documents have to be skipped by hand;
+    // a null Bits instance means the reader has no deletions
+    Bits liveDocs = MultiFields.getLiveDocs(indexReader);
+    int maxDoc = indexReader.maxDoc();
+    for (int i = 0; i < maxDoc; i++) {
+      if (liveDocs != null && !liveDocs.get(i)) {
+        continue;
+      }
+      IndexableField field = indexReader.document(i).getField(classFieldName);
+      String cl = field == null ? null : field.stringValue();
+      if (cl == null) {
+        // this doc has no (textual) class field
+        continue;
+      }
+      Integer seen = counts.get(cl);
+      counts.put(cl, seen == null ? 1 : seen + 1);
+      labelledDocs++;
+    }
+    classCounts = counts;
+    docsWithClassSize = labelledDocs;
+  }
+
+  /** Runs the analyzer over the given text and returns the produced terms in order. */
+  private String[] tokenizeDoc(String doc) throws IOException {
+    List<String> result = new ArrayList<String>();
+    TokenStream tokenStream = analyzer.tokenStream(textFieldName, new StringReader(doc));
+    try {
+      CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
+      tokenStream.reset();
+      while (tokenStream.incrementToken()) {
+        result.add(charTermAttribute.toString());
+      }
+      tokenStream.end();
+    } finally {
+      // release the stream even when analysis fails half way
+      tokenStream.close();
+    }
+    return result.toArray(new String[result.size()]);
+  }
+
+  /**
+   * Returns the class with the highest score for the given text.
+   *
+   * @param inputDocument the text to classify
+   * @return the best class, or {@code null} when training found no class at all
+   * @throws ClassificationException if the index cannot be read
+   * @throws IllegalStateException if no training run has completed
+   * @throws IllegalArgumentException if {@code inputDocument} is {@code null}
+   */
+  @Override
+  public String assignClass(String inputDocument) throws ClassificationException {
+    if (priors == null) {
+      throw new IllegalStateException("need to train the classifier (use train method)");
+    }
+    if (inputDocument == null) {
+      throw new IllegalArgumentException("the text to classify must not be null");
+    }
+    try {
+      // the tokens do not depend on the class, so analyze the text only once
+      String[] words = tokenizeDoc(inputDocument);
+      double max = Double.NEGATIVE_INFINITY;
+      String foundClass = null;
+      for (String cl : classCounts.keySet()) {
+        // log(P(c)) + log(P(d|c)) : same ordering as P(c) * P(d|c), but immune to underflow
+        double clVal = Math.log(priors.get(cl)) + calculateLogLikelihood(words, cl);
+        if (clVal > max) {
+          max = clVal;
+          foundClass = cl;
+        }
+      }
+      return foundClass;
+    } catch (IOException e) {
+      throw new ClassificationException(e);
+    }
+  }
+
+
+  /** Returns log(P(d|c)) = log(P(w1|c)) + ... + log(P(wn|c)) for the given words and class. */
+  private double calculateLogLikelihood(String[] words, String c) throws IOException {
+    // den : the term vector sizes of the documents of class c, plus the number of labelled documents;
+    // it does not depend on the word, so it is computed once per class
+    double den = countInClassC(c) + docsWithClassSize;
+
+    double result = 0d;
+    for (String word : words) {
+      // search with text:word AND class:c
+      int hits = countWordInClassC(c, word);
+
+      // num : count the no of documents of class c the word appears in (+1)
+      double num = hits + 1; // +1 is added because of add 1 smoothing
+
+      // P(w|c) = num/den
+      result += Math.log(num / den);
+    }
+    return result;
+  }
+
+  /** Sums, over the documents of class c, the size of the term vector of the text field. */
+  private double countInClassC(String c) throws IOException {
+    IndexReader indexReader = indexSearcher.getIndexReader();
+    // every match is needed, and there can be at most maxDoc of them
+    int limit = Math.max(1, indexReader.maxDoc());
+    TopDocs topDocs = indexSearcher.search(new TermQuery(new Term(classFieldName, c)), limit);
+    long res = 0;
+    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
+      Fields termVectors = indexReader.getTermVectors(scoreDoc.doc);
+      Terms terms = termVectors == null ? null : termVectors.terms(textFieldName);
+      if (terms == null) {
+        // TODO : warn about not existing term vectors for field 'textFieldName'
+        continue;
+      }
+      long size = terms.size();
+      if (size > 0) {
+        // size() reports -1 when the number of terms is not known
+        res += size;
+      }
+    }
+    return res;
+  }
+
+  /** Counts the documents of class c whose text field contains the given word. */
+  private int countWordInClassC(String c, String word) throws IOException {
+    BooleanQuery booleanQuery = new BooleanQuery();
+    booleanQuery.add(new BooleanClause(new TermQuery(new Term(textFieldName, word)), BooleanClause.Occur.MUST));
+    booleanQuery.add(new BooleanClause(new TermQuery(new Term(classFieldName, c)), BooleanClause.Occur.MUST));
+    TopDocs topDocs = indexSearcher.search(booleanQuery, 1);
+    return topDocs.totalHits;
+  }
+
+  /** P(c) : share of the labelled documents that match the given class. */
+  private double calculatePrior(String currentClass) throws IOException {
+    return (double) docCount(currentClass) / docsWithClassSize;
+  }
+
+  /** Counts the documents whose class field matches the given class. */
+  private int docCount(String countedClass) throws IOException {
+    return indexSearcher.search(new TermQuery(new Term(classFieldName, countedClass)), 1).totalHits;
+  }
+}
Index: solr/contrib/classification/src/java/org/apache/solr/handler/classification/ClassificationException.java
===================================================================
--- solr/contrib/classification/src/java/org/apache/solr/handler/classification/ClassificationException.java (revision 0)
+++ solr/contrib/classification/src/java/org/apache/solr/handler/classification/ClassificationException.java (revision 0)
@@ -0,0 +1,36 @@
+package org.apache.solr.handler.classification;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Checked {@link Exception} that the methods of a {@link Classifier} declare for reporting failures.
+ */
+public class ClassificationException extends Exception {
+
+  /**
+   * @param message description of the failure
+   */
+  public ClassificationException(String message) {
+    super(message);
+  }
+
+  /**
+   * @param message description of the failure
+   * @param cause the underlying error
+   */
+  public ClassificationException(String message, Throwable cause) {
+    super(message, cause);
+  }
+
+  /**
+   * @param cause the underlying error
+   */
+  public ClassificationException(Throwable cause) {
+    super(cause);
+  }
+}
Index: solr/contrib/classification/src/java/org/apache/solr/handler/classification/ClassifierRequestHandler.java
===================================================================
--- solr/contrib/classification/src/java/org/apache/solr/handler/classification/ClassifierRequestHandler.java (revision 0)
+++ solr/contrib/classification/src/java/org/apache/solr/handler/classification/ClassifierRequestHandler.java (revision 0)
@@ -0,0 +1,65 @@
+package org.apache.solr.handler.classification;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.solr.handler.RequestHandlerBase;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+
+/**
+ * A {@link RequestHandlerBase} which returns the class assigned by a certain {@link Classifier} algorithm to a given text.
+ * <p>
+ * Request parameters, all of them mandatory:
+ * <ul>
+ * <li>{@code textfield} : name of the field holding the text of the indexed documents</li>
+ * <li>{@code classfield} : name of the field holding the class of the indexed documents</li>
+ * <li>{@code text} : the text to classify</li>
+ * </ul>
+ * The assigned class is added to the response under {@code found-class}.
+ */
+public class ClassifierRequestHandler extends RequestHandlerBase {
+
+  /**
+   * Trains a new {@link SimpleNaiveBayesClassifier} on the index of the request and uses it to
+   * classify the text given in the request.
+   *
+   * @param req the request carrying the {@code textfield}, {@code classfield} and {@code text} parameters
+   * @param rsp the response the {@code found-class} entry is added to
+   * @throws Exception if a parameter is missing or the classifier fails
+   */
+  @Override
+  public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
+    // TODO : make all the parameters configurable
+
+    // read every parameter up front through required(): a missing one is then reported by Solr
+    // as a missing required parameter, before the expensive training step, instead of surfacing
+    // as a NullPointerException from inside the classifier
+    String textFieldName = req.getParams().required().get("textfield");
+    String classFieldName = req.getParams().required().get("classfield");
+    String text = req.getParams().required().get("text");
+
+    // instantiate a new classifier
+    Classifier classifier = new SimpleNaiveBayesClassifier();
+
+    // get the training resources from the request
+    IndexSearcher solrIndexSearcher = new IndexSearcher(req.getSearcher().getAtomicReader());
+    Analyzer analyzer = req.getSchema().getAnalyzer();
+
+    // train
+    classifier.train(solrIndexSearcher, textFieldName, classFieldName, analyzer);
+
+    // calculate class
+    String foundClass = classifier.assignClass(text);
+
+    // add result to the response
+    rsp.add("found-class", foundClass);
+  }
+
+  @Override
+  public String getDescription() {
+    return "Solr ClassifierRequestHandler";
+  }
+
+  @Override
+  public String getSource() {
+    return "$URL";
+  }
+
+
+}
Index: solr/contrib/classification/README.txt
===================================================================
--- solr/contrib/classification/README.txt (revision 0)
+++ solr/contrib/classification/README.txt (revision 0)
@@ -0,0 +1,4 @@
+The Classification contrib plugin for Solr provides a generic mechanism for plugging in Lucene/Solr or custom classification algorithms implementations.
+It currently provides classification support using a simplistic Lucene based Naive Bayes classifier.
+ +See http://wiki.apache.org/solr/ClassificationComponent for how to get started. Index: solr/contrib/classification/build.xml =================================================================== --- solr/contrib/classification/build.xml (revision 0) +++ solr/contrib/classification/build.xml (revision 0) @@ -0,0 +1,27 @@ + + + + + + + + Classification module + + + +