Details
-
New Feature
-
Status: Reopened
-
Minor
-
Resolution: Unresolved
-
None
-
None
-
None
-
New, Patch Available
Description
Bayesian classifiers that use Lucene as the data store. They are based on the Naive Bayes and Fisher's method algorithms as described by Toby Segaran in "Programming Collective Intelligence", ISBN 978-0-596-52932-1.
Have fun.
The Javadocs are sparse, but the following TestCase shows how to use it:
public class TestClassifier extends TestCase { public void test() throws Exception { InstanceFactory instanceFactory = new InstanceFactory() { public Document factory(String text, String _class) { Document doc = new Document(); doc.add(new Field("class", _class, Field.Store.YES, Field.Index.NO_NORMS)); doc.add(new Field("text", text, Field.Store.YES, Field.Index.NO, Field.TermVector.NO)); doc.add(new Field("text/ngrams/start", text, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES)); doc.add(new Field("text/ngrams/inner", text, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES)); doc.add(new Field("text/ngrams/end", text, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES)); return doc; } Analyzer analyzer = new Analyzer() { private int minGram = 2; private int maxGram = 3; public TokenStream tokenStream(String fieldName, Reader reader) { TokenStream ts = new StandardTokenizer(reader); ts = new LowerCaseFilter(ts); if (fieldName.endsWith("/ngrams/start")) { ts = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.FRONT, minGram, maxGram); } else if (fieldName.endsWith("/ngrams/inner")) { ts = new NGramTokenFilter(ts, minGram, maxGram); } else if (fieldName.endsWith("/ngrams/end")) { ts = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.BACK, minGram, maxGram); } return ts; } }; public Analyzer getAnalyzer() { return analyzer; } }; Directory dir = new RAMDirectory(); new IndexWriter(dir, null, true).close(); Instances instances = new Instances(dir, instanceFactory, "class"); instances.addInstance("hello world", "en"); instances.addInstance("hallå världen", "sv"); instances.addInstance("this is london calling", "en"); instances.addInstance("detta är london som ringer", "sv"); instances.addInstance("john has a long mustache", "en"); instances.addInstance("john har en lång mustache", "sv"); instances.addInstance("all work and no play makes jack a dull boy", "en"); instances.addInstance("att bara arbeta och aldrig leka gör jack en 
trist gosse", "sv"); instances.addInstance("shrimp sandwich", "en"); instances.addInstance("räksmörgås", "sv"); instances.addInstance("it's now or never", "en"); instances.addInstance("det är nu eller aldrig", "sv"); instances.addInstance("to tie up at a landing-stage", "en"); instances.addInstance("att angöra en brygga", "sv"); instances.addInstance("it's now time for the children's television shows", "en"); instances.addInstance("nu är det dags för barnprogram", "sv"); instances.flush(); testClassifier(instances, new NaiveBayesClassifier()); testClassifier(instances, new FishersMethodClassifier()); instances.close(); } private void testClassifier(Instances instances, BayesianClassifier classifier) throws IOException { assertEquals("sv", classifier.classify(instances, "detta blir ett test")[0].getClassification()); assertEquals("en", classifier.classify(instances, "this will be a test")[0].getClassification()); // test training data instances. all ought to match! for (int documentNumber = 0; documentNumber < instances.getIndexReader().maxDoc(); documentNumber++) { if (!instances.getIndexReader().isDeleted(documentNumber)) { Map<Term, Double> features = instances.extractFeatures(instances.getIndexReader(), documentNumber, classifier.isNormalized()); Document document = instances.getIndexReader().document(documentNumber); assertEquals(document.get("class"), classifier.classify(instances, features)[0].getClassification()); } } }