Uploaded image for project: 'Lucene - Core'
  1. Lucene - Core
  2. LUCENE-1039

Bayesian classifiers using Lucene as data store

Details

    • New Feature
    • Status: Reopened
    • Minor
    • Resolution: Unresolved
    • None
    • None
    • core/store
    • None
    • New, Patch Available

    Description

      Bayesian classifiers using Lucene as data store. Based on the Naive Bayes and Fisher method algorithms as described by Toby Segaran in "Programming Collective Intelligence", ISBN 978-0-596-52932-1.

      Have fun.

      The Javadocs are sparse, but the following test case shows how to use it:

      public class TestClassifier extends TestCase {
      
        public void test() throws Exception {
      
          InstanceFactory instanceFactory = new InstanceFactory() {
      
            public Document factory(String text, String _class) {
              Document doc = new Document();
              doc.add(new Field("class", _class, Field.Store.YES, Field.Index.NO_NORMS));
      
              doc.add(new Field("text", text, Field.Store.YES, Field.Index.NO, Field.TermVector.NO));
      
              doc.add(new Field("text/ngrams/start", text, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
              doc.add(new Field("text/ngrams/inner", text, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
              doc.add(new Field("text/ngrams/end", text, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
              return doc;
            }
      
            Analyzer analyzer = new Analyzer() {
              private int minGram = 2;
              private int maxGram = 3;
      
              public TokenStream tokenStream(String fieldName, Reader reader) {
                TokenStream ts = new StandardTokenizer(reader);
                ts = new LowerCaseFilter(ts);
                if (fieldName.endsWith("/ngrams/start")) {
                  ts = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.FRONT, minGram, maxGram);
                } else if (fieldName.endsWith("/ngrams/inner")) {
                  ts = new NGramTokenFilter(ts, minGram, maxGram);
                } else if (fieldName.endsWith("/ngrams/end")) {
                  ts = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.BACK, minGram, maxGram);
                }
                return ts;
              }
            };
      
            public Analyzer getAnalyzer() {
              return analyzer;
            }
          };
      
          Directory dir = new RAMDirectory();
          new IndexWriter(dir, null, true).close();
      
          Instances instances = new Instances(dir, instanceFactory, "class");
      
          instances.addInstance("hello world", "en");
          instances.addInstance("hallå världen", "sv");
      
          instances.addInstance("this is london calling", "en");
          instances.addInstance("detta är london som ringer", "sv");
      
          instances.addInstance("john has a long mustache", "en");
          instances.addInstance("john har en lång mustache", "sv");
      
          instances.addInstance("all work and no play makes jack a dull boy", "en");
          instances.addInstance("att bara arbeta och aldrig leka gör jack en trist gosse", "sv");
      
          instances.addInstance("shrimp sandwich", "en");
          instances.addInstance("räksmörgås", "sv");
      
          instances.addInstance("it's now or never", "en");
          instances.addInstance("det är nu eller aldrig", "sv");
      
          instances.addInstance("to tie up at a landing-stage", "en");
          instances.addInstance("att angöra en brygga", "sv");
      
          instances.addInstance("it's now time for the children's television shows", "en");
          instances.addInstance("nu är det dags för barnprogram", "sv");
      
          instances.flush();
      
          testClassifier(instances, new NaiveBayesClassifier());
          testClassifier(instances, new FishersMethodClassifier());
      
          instances.close();
        }
      
        private void testClassifier(Instances instances, BayesianClassifier classifier) throws IOException {
      
          assertEquals("sv", classifier.classify(instances, "detta blir ett test")[0].getClassification());
          assertEquals("en", classifier.classify(instances, "this will be a test")[0].getClassification());
      
          // test training data instances. all ought to match!
          for (int documentNumber = 0; documentNumber < instances.getIndexReader().maxDoc(); documentNumber++) {
            if (!instances.getIndexReader().isDeleted(documentNumber)) {
              Map<Term, Double> features = instances.extractFeatures(instances.getIndexReader(), documentNumber, classifier.isNormalized());
              Document document = instances.getIndexReader().document(documentNumber);
              assertEquals(document.get("class"), classifier.classify(instances, features)[0].getClassification());
            }
          }
        }
      
      

      Attachments

        1. LUCENE-1039.txt
          27 kB
          Karl Wettin

        Activity

          People

            karl.wettin Karl Wettin
            karl.wettin Karl Wettin
            Votes:
            1 Vote for this issue
            Watchers:
            4 Start watching this issue

            Dates

              Created:
              Updated: