Index: solr/contrib/classification/ivy.xml
===================================================================
--- solr/contrib/classification/ivy.xml (revision 0)
+++ solr/contrib/classification/ivy.xml (revision 0)
@@ -0,0 +1,24 @@
+
+
+
+
+
+
+
Index: solr/contrib/classification/src/test/org/apache/solr/handler/classification/ClassifierRequestHandlerTest.java
===================================================================
--- solr/contrib/classification/src/test/org/apache/solr/handler/classification/ClassifierRequestHandlerTest.java (revision 0)
+++ solr/contrib/classification/src/test/org/apache/solr/handler/classification/ClassifierRequestHandlerTest.java (revision 0)
@@ -0,0 +1,81 @@
+package org.apache.solr.handler.classification;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+
+/**
+ * Testcase for {@link ClassifierRequestHandler}
+ */
+public class ClassifierRequestHandlerTest extends SolrTestCaseJ4 {
+  /** Initializes the test core from the Naive Bayes config and schema under classification/solr. */
+  @BeforeClass
+  public static void nbBeforeClass() throws Exception {
+    initCore("solrconfig_nb.xml", "schema_nb.xml", "classification/solr");
+  }
+  /** Indexes four "politics" and three "technology" documents, then classifies an unseen technology text. */
+  @Test
+  public void testNBBasicUsage() throws Exception {
+    // fields of the training documents; the text and class field names are also passed to the handler
+    String idFieldName = "id";
+    String textFieldName = "text";
+    String classFieldName = "cat";
+    assertU(adoc(idFieldName, "1", textFieldName, "The traveling press secretary for Mitt Romney lost his cool and cursed at reporters " +
+        "who attempted to ask questions of the Republican presidential candidate in a public plaza near the Tomb of " +
+        "the Unknown Soldier in Warsaw Tuesday.", classFieldName, "politics"));
+    assertU(adoc(idFieldName, "2", textFieldName, " Mitt Romney seeks to assure Israel and Iran, as well as Jewish voters in the United" +
+        " States, that he will be tougher against Iran's nuclear ambitions than President Barack Obama.", classFieldName, "politics"));
+    assertU(adoc(idFieldName, "3", textFieldName, "And there's a threshold question that he has to answer for the American people and " +
+        "that's whether he is prepared to be commander-in-chief,\" she continued. \"As we look to the past events, we " +
+        "know that this raises some questions about his preparedness and we'll see how the rest of his trip goes.\"",
+        classFieldName, "politics"));
+    assertU(adoc(idFieldName, "4", textFieldName, "Still, when it comes to gun policy, many congressional Democrats have \"decided to " +
+        "keep quiet and not go there,\" said Alan Lizotte, dean and professor at the State University of New York at " +
+        "Albany's School of Criminal Justice.", classFieldName, "politics"));
+    assertU(adoc(idFieldName, "5", textFieldName, "Standing amongst the thousands of people at the state Capitol, Jorstad, director of " +
+        "technology at the University of Wisconsin-La Crosse, documented the historic moment and shared it with the " +
+        "world through the Internet.", classFieldName, "technology"));
+    assertU(adoc(idFieldName, "6", textFieldName, "So, about all those experts and analysts who've spent the past year or so saying " +
+        "Facebook was going to make a phone. A new expert has stepped forward to say it's not going to happen.", classFieldName,
+        "technology"));
+    assertU(adoc(idFieldName, "7", textFieldName, "More than 400 million people trust Google with their e-mail, and 50 million store files" +
+        " in the cloud using the Dropbox service. People manage their bank accounts, pay bills, trade stocks and " +
+        "generally transfer or store huge volumes of personal data online.", classFieldName, "technology"));
+    assertU(commit());
+    assertQ(req("*:*"), "//*[@numFound='7']");
+    assertQ(req("cat:politics"), "//*[@numFound='4']");
+    // text to classify; it is not one of the indexed documents
+    String newText = "Much is made of what the likes of Facebook, Google and Apple know about users. Truth is, Amazon may know more. ";
+    // the handler reads the text from its "text" parameter, whatever the training text field is called
+    SolrQueryRequest nbReq = lrf.makeRequest("textfield", textFieldName, "classfield", classFieldName, "text", newText);
+    // dispatch to the handler named "nb" (presumably registered in solrconfig_nb.xml); always release the request
+    try {
+      SolrQueryResponse response = h.queryAndResponse("nb", nbReq);
+      Object foundClass = response.getValues().get("found-class");
+      assertNotNull(foundClass);
+      assertEquals("technology", foundClass);
+    } finally {
+      nbReq.close();
+    }
+  }
+}
Index: solr/contrib/classification/src/test-files/classification/solr/collection1/conf/solrconfig_nb.xml
===================================================================
--- solr/contrib/classification/src/test-files/classification/solr/collection1/conf/solrconfig_nb.xml (revision 0)
+++ solr/contrib/classification/src/test-files/classification/solr/collection1/conf/solrconfig_nb.xml (revision 0)
@@ -0,0 +1,61 @@
+
+
+
+
+ ${tests.luceneMatchVersion:LUCENE_CURRENT}
+
+ ${solr.data.dir:}
+
+
+
+
+
+
+
+
+
+ true
+ 20
+ 200
+ false
+ 2
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Property changes on: solr/contrib/classification/src/test-files/classification/solr/collection1/conf/solrconfig_nb.xml
___________________________________________________________________
Added: svn:executable
+ *
Index: solr/contrib/classification/src/test-files/classification/solr/collection1/conf/schema_nb.xml
===================================================================
--- solr/contrib/classification/src/test-files/classification/solr/collection1/conf/schema_nb.xml (revision 0)
+++ solr/contrib/classification/src/test-files/classification/solr/collection1/conf/schema_nb.xml (revision 0)
@@ -0,0 +1,34 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: solr/contrib/classification/src/java/org/apache/solr/handler/classification/Classifier.java
===================================================================
--- solr/contrib/classification/src/java/org/apache/solr/handler/classification/Classifier.java (revision 0)
+++ solr/contrib/classification/src/java/org/apache/solr/handler/classification/Classifier.java (revision 0)
@@ -0,0 +1,33 @@
+package org.apache.solr.handler.classification;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.search.IndexSearcher;
+
+/**
+ * Contract for a text classifier that learns from documents held in a Lucene index.
+ */
+public interface Classifier {
+  /** Returns the class assigned to {@code text}; implementations may require a prior call to {@code train}. */
+  String assignClass(String text) throws ClassificationException;
+  /** Trains on the documents behind {@code indexSearcher}, using the named text and class fields. */
+  void train(IndexSearcher indexSearcher, String textFieldName, String classFieldName, Analyzer analyzer)
+      throws ClassificationException;
+
+}
Index: solr/contrib/classification/src/java/org/apache/solr/handler/classification/SimpleNaiveBayesClassifier.java
===================================================================
--- solr/contrib/classification/src/java/org/apache/solr/handler/classification/SimpleNaiveBayesClassifier.java (revision 0)
+++ solr/contrib/classification/src/java/org/apache/solr/handler/classification/SimpleNaiveBayesClassifier.java (revision 0)
@@ -0,0 +1,182 @@
+package org.apache.solr.handler.classification;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Map;
+
+/**
+ * A Lucene based NaiveBayes classifier
+ */
+public class SimpleNaiveBayesClassifier implements Classifier {
+  private Map<String, Double> priors; // log(P(c)) for every class c seen while training
+  private IndexSearcher indexSearcher;
+  private String textFieldName;
+  private String classFieldName;
+  private Map<String, Double> classCounts; // number of training documents per class value
+  private int docsWithClassSize; // number of documents that have a class value
+  private Analyzer analyzer;
+
+  /** Remembers the searcher, field names and analyzer, then collects the classes and their log priors. */
+  public void train(IndexSearcher indexSearcher, String textFieldName, String classFieldName, Analyzer analyzer)
+      throws ClassificationException {
+    this.indexSearcher = indexSearcher;
+    this.textFieldName = textFieldName;
+    this.classFieldName = classFieldName;
+    this.analyzer = analyzer;
+    try {
+      createVocabulary();
+      preComputePriors();
+    } catch (IOException e) {
+      throw new ClassificationException(e);
+    }
+  }
+
+  /** Stores log(P(c)) per class; logs keep long products of small probabilities from underflowing. */
+  private void preComputePriors() throws IOException {
+    priors = new HashMap<String, Double>();
+    for (String cl : classCounts.keySet()) {
+      priors.put(cl, Math.log(calculatePrior(cl)));
+    }
+  }
+
+  /** Scans all documents, counting documents per class value and documents that have a class value at all. */
+  private void createVocabulary() throws IOException {
+    classCounts = new HashMap<String, Double>();
+    int labelled = 0;
+    TopDocs topDocs = indexSearcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE);
+    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
+      IndexableField field = indexSearcher.doc(scoreDoc.doc).getField(classFieldName);
+      String cl = field == null ? null : field.stringValue();
+      if (cl == null) {
+        continue; // no class value on this doc, so it is not a training example
+      }
+      Double seen = classCounts.get(cl);
+      classCounts.put(cl, seen == null ? 1d : seen + 1);
+      labelled++;
+    }
+    docsWithClassSize = labelled; // topDocs.totalHits would also count the documents skipped above
+  }
+
+  /** Runs the analyzer over {@code doc} and returns the terms it produces, in order. */
+  private String[] tokenizeDoc(String doc) throws IOException {
+    Collection<String> result = new LinkedList<String>();
+    TokenStream tokenStream = analyzer.tokenStream(textFieldName, new StringReader(doc));
+    try {
+      CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
+      tokenStream.reset(); // must be called before the first incrementToken()
+      while (tokenStream.incrementToken()) {
+        result.add(charTermAttribute.toString());
+      }
+      tokenStream.end();
+    } finally {
+      tokenStream.close(); // release the stream even when analysis fails
+    }
+    return result.toArray(new String[result.size()]);
+  }
+
+  /**
+   * Returns the class c maximizing log(P(c)) + log(P(d|c)) for the text, or {@code null} if no class has a
+   * non-zero prior. Throws IllegalStateException if untrained, ClassificationException on null text or I/O errors.
+   */
+  public String assignClass(String inputDocument) throws ClassificationException {
+    if (indexSearcher == null) {
+      throw new IllegalStateException("need to train the classifier (use train method)");
+    }
+    if (inputDocument == null) {
+      throw new ClassificationException("no text to classify");
+    }
+    try {
+      String[] tokens = tokenizeDoc(inputDocument); // analyze once, not once per class
+      double max = Double.NEGATIVE_INFINITY;
+      String foundClass = null;
+      for (String cl : classCounts.keySet()) {
+        double clVal = priors.get(cl) + calculateLogLikelihood(tokens, cl);
+        if (clVal > max) {
+          max = clVal;
+          foundClass = cl;
+        }
+      }
+      return foundClass;
+    } catch (IOException e) {
+      throw new ClassificationException(e);
+    }
+  }
+
+  /** Computes log(P(d|c)) = log(P(w1|c)) + ... + log(P(wn|c)), with P(w|c) = num / den. */
+  private double calculateLogLikelihood(String[] tokens, String c) throws IOException {
+    // NOTE(review): docsWithClassSize is added where the formula calls for |V| - confirm this is intended
+    double den = countInClassC(c) + docsWithClassSize; // same for every word, so computed once per class
+    double result = 0d;
+    for (String word : tokens) {
+      double num = countWordInClassC(c, word) + 1; // +1 is add-one smoothing
+      result += Math.log(num / den);
+    }
+    return result;
+  }
+
+  /** Sums, over the documents of class {@code c}, the term count reported by the text field's term vector. */
+  private double countInClassC(String c) throws IOException {
+    TopDocs topDocs = indexSearcher.search(new TermQuery(new Term(classFieldName, c)), Integer.MAX_VALUE);
+    long res = 0;
+    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
+      Fields termVectors = indexSearcher.getIndexReader().getTermVectors(scoreDoc.doc);
+      org.apache.lucene.index.Terms terms = termVectors == null ? null : termVectors.terms(textFieldName);
+      long size = terms == null ? 0 : terms.size(); // a missing term vector contributes nothing
+      res += Math.max(0L, size); // size() is -1 when the count is unavailable
+    }
+    return res;
+  }
+
+  /** Counts the documents of class {@code c} that contain {@code word} in the text field. */
+  private int countWordInClassC(String c, String word) throws IOException {
+    BooleanQuery booleanQuery = new BooleanQuery();
+    booleanQuery.add(new BooleanClause(new TermQuery(new Term(textFieldName, word)), BooleanClause.Occur.MUST));
+    booleanQuery.add(new BooleanClause(new TermQuery(new Term(classFieldName, c)), BooleanClause.Occur.MUST));
+    TopDocs topDocs = indexSearcher.search(booleanQuery, 1);
+    return topDocs.totalHits;
+  }
+
+  /** P(c): share of the labelled documents that match {@code currentClass}. */
+  private double calculatePrior(String currentClass) throws IOException {
+    return (double) docCount(currentClass) / docsWithClassSize;
+  }
+
+  /** Counts the documents whose class field matches the term {@code countedClass}. */
+  private int docCount(String countedClass) throws IOException {
+    return indexSearcher.search(new TermQuery(new Term(classFieldName, countedClass)), 1).totalHits;
+  }
+}
Index: solr/contrib/classification/src/java/org/apache/solr/handler/classification/ClassificationException.java
===================================================================
--- solr/contrib/classification/src/java/org/apache/solr/handler/classification/ClassificationException.java (revision 0)
+++ solr/contrib/classification/src/java/org/apache/solr/handler/classification/ClassificationException.java (revision 0)
@@ -0,0 +1,36 @@
+package org.apache.solr.handler.classification;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * An {@link Exception} thrown if any errors occurs within a {@link Classifier}
+ */
+public class ClassificationException extends Exception {
+  /** Creates an exception that carries only a description of the failure. */
+  public ClassificationException(String message) {
+    super(message);
+  }
+  /** Creates an exception with a description and the underlying cause. */
+  public ClassificationException(String message, Throwable cause) {
+    super(message, cause);
+  }
+  /** Creates an exception that wraps the underlying cause. */
+  public ClassificationException(Throwable cause) {
+    super(cause);
+  }
+}
Index: solr/contrib/classification/src/java/org/apache/solr/handler/classification/ClassifierRequestHandler.java
===================================================================
--- solr/contrib/classification/src/java/org/apache/solr/handler/classification/ClassifierRequestHandler.java (revision 0)
+++ solr/contrib/classification/src/java/org/apache/solr/handler/classification/ClassifierRequestHandler.java (revision 0)
@@ -0,0 +1,65 @@
+package org.apache.solr.handler.classification;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.solr.handler.RequestHandlerBase;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+
+/**
+ * A {@link RequestHandlerBase} which returns the class assigned by a certain {@link Classifier} algorithm to a given text
+ */
+public class ClassifierRequestHandler extends RequestHandlerBase {
+
+  /**
+   * Trains a fresh {@link SimpleNaiveBayesClassifier} on the request's index, classifies the "text" parameter
+   * and adds the result to the response under "found-class". The "textfield", "classfield" and "text"
+   * parameters are required; a missing one is rejected up front instead of failing later inside the classifier.
+   */
+  @Override
+  public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
+    // TODO : make all the parameters configurable
+
+    // read and validate the request parameters before doing any work
+    String textFieldName = req.getParams().required().get("textfield");
+    String classFieldName = req.getParams().required().get("classfield");
+    String text = req.getParams().required().get("text");
+
+    // the classifier is trained from scratch on every request, over the request's searcher and schema analyzer
+    IndexSearcher solrIndexSearcher = new IndexSearcher(req.getSearcher().getAtomicReader());
+    Analyzer analyzer = req.getSchema().getAnalyzer();
+    Classifier classifier = new SimpleNaiveBayesClassifier();
+    classifier.train(solrIndexSearcher, textFieldName, classFieldName, analyzer);
+
+    // calculate the class and add it to the response
+    rsp.add("found-class", classifier.assignClass(text));
+  }
+
+  @Override
+  public String getDescription() {
+    return "Solr ClassifierRequestHandler";
+  }
+
+  @Override
+  public String getSource() {
+    return "$URL";
+  }
+
+}
Index: solr/contrib/classification/README.txt
===================================================================
--- solr/contrib/classification/README.txt (revision 0)
+++ solr/contrib/classification/README.txt (revision 0)
@@ -0,0 +1,4 @@
+The Classification contrib plugin for Solr provides a generic mechanism for plugging in Lucene/Solr or custom classification algorithm implementations.
+It currently provides classification support using a simplistic Lucene-based Naive Bayes classifier.
+
+See http://wiki.apache.org/solr/ClassificationComponent for how to get started.
Index: solr/contrib/classification/build.xml
===================================================================
--- solr/contrib/classification/build.xml (revision 0)
+++ solr/contrib/classification/build.xml (revision 0)
@@ -0,0 +1,27 @@
+
+
+
+
+
+
+
+ Classification module
+
+
+
+