Index: contrib/classifier/src/test/org/apache/lucene/classifier/TestClassifier.java =================================================================== --- contrib/classifier/src/test/org/apache/lucene/classifier/TestClassifier.java (revision 0) +++ contrib/classifier/src/test/org/apache/lucene/classifier/TestClassifier.java (revision 0) @@ -0,0 +1,137 @@ +package org.apache.lucene.classifier; + +import junit.framework.TestCase; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; +import org.apache.lucene.analysis.ngram.NGramTokenFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; + +import java.io.IOException; +import java.io.Reader; +import java.util.Map; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *
+ */
+
+
+/**
+ * Trains an {@link Instances} set on a handful of English ("en") and Swedish ("sv")
+ * phrases and checks that both Bayesian classifiers label unseen text as well as
+ * every training document with the expected language.
+ *
+ * @author karl wettin
+ * Date: 2007-okt-24
+ * Time: 02:31:42
+ */
+public class TestClassifier extends TestCase {
+
+  public void test() throws Exception {
+
+    // Builds the indexed documents and supplies the analyzer used for them.
+    InstanceFactory instanceFactory = new InstanceFactory() {
+
+      // The same text is indexed three times, once per n-gram flavour.
+      private final String[] ngramFields = {
+          "text/ngrams/start",
+          "text/ngrams/inner",
+          "text/ngrams/end"
+      };
+
+      public Document factory(String text, String _class) {
+        Document document = new Document();
+        document.add(new Field("class", _class, Field.Store.YES, Field.Index.NO_NORMS));
+        document.add(new Field("text", text, Field.Store.YES, Field.Index.NO, Field.TermVector.NO));
+        for (int i = 0; i < ngramFields.length; i++) {
+          document.add(new Field(ngramFields[i], text, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
+        }
+        return document;
+      }
+
+      Analyzer analyzer = new Analyzer() {
+        private int minGram = 2;
+        private int maxGram = 3;
+
+        // Lower-cased standard tokens, further split into n-grams according to the field suffix.
+        public TokenStream tokenStream(String fieldName, Reader reader) {
+          TokenStream stream = new LowerCaseFilter(new StandardTokenizer(reader));
+          if (fieldName.endsWith("/ngrams/start")) {
+            return new EdgeNGramTokenFilter(stream, EdgeNGramTokenFilter.Side.FRONT, minGram, maxGram);
+          }
+          if (fieldName.endsWith("/ngrams/inner")) {
+            return new NGramTokenFilter(stream, minGram, maxGram);
+          }
+          if (fieldName.endsWith("/ngrams/end")) {
+            return new EdgeNGramTokenFilter(stream, EdgeNGramTokenFilter.Side.BACK, minGram, maxGram);
+          }
+          return stream;
+        }
+      };
+
+      public Analyzer getAnalyzer() {
+        return analyzer;
+      }
+    };
+
+    // Create an empty index for Instances to open.
+    Directory directory = new RAMDirectory();
+    new IndexWriter(directory, null, true).close();
+
+    Instances instances = new Instances(directory, instanceFactory, "class");
+
+    // {text, class} pairs, added in this order.
+    String[][] trainingData = {
+        {"hello world", "en"},
+        {"hallå världen", "sv"},
+        {"this is london calling", "en"},
+        {"detta är london som ringer", "sv"},
+        {"john has a long mustache", "en"},
+        {"john har en lång mustache", "sv"},
+        {"all work and no play makes jack a dull boy", "en"},
+        {"att bara arbeta och aldrig leka gör jack en trist gosse", "sv"},
+        {"shrimp sandwich", "en"},
+        {"räksmörgås", "sv"},
+        {"it's now or never", "en"},
+        {"det är nu eller aldrig", "sv"},
+        {"to tie up at a landing-stage", "en"},
+        {"att angöra en brygga", "sv"},
+        {"it's now time for the children's television shows", "en"},
+        {"nu är det dags för barnprogram", "sv"}
+    };
+    for (int i = 0; i < trainingData.length; i++) {
+      instances.addInstance(trainingData[i][0], trainingData[i][1]);
+    }
+
+    instances.flush();
+
+    testClassifier(instances, new NaiveBayesClassifier());
+    testClassifier(instances, new FishersMethodClassifier());
+
+    instances.close();
+
+  }
+
+  /**
+   * Asserts that the classifier ranks the expected language first for two unseen
+   * phrases and for every non-deleted training document.
+   */
+  private void testClassifier(Instances instances, BayesianClassifier classifier) throws IOException {
+
+    assertEquals("sv", classifier.classify(instances, "detta blir ett test")[0].getClassification());
+    assertEquals("en", classifier.classify(instances, "this will be a test")[0].getClassification());
+
+    // Training data: every document must be classified as the class it was added with.
+    int maxDoc = instances.getIndexReader().maxDoc();
+    for (int documentNumber = 0; documentNumber < maxDoc; documentNumber++) {
+      if (instances.getIndexReader().isDeleted(documentNumber)) {
+        continue;
+      }
+      Map features = instances.extractFeatures(instances.getIndexReader(), documentNumber, classifier.isNormalized());
+      Document document = instances.getIndexReader().document(documentNumber);
+      assertEquals(document.get("class"), classifier.classify(instances, features)[0].getClassification());
+    }
+  }
+
+
+}
Index: contrib/classifier/src/java/org/apache/lucene/classifier/BayesianClassifier.java
===================================================================
--- contrib/classifier/src/java/org/apache/lucene/classifier/BayesianClassifier.java (revision 0)
+++ contrib/classifier/src/java/org/apache/lucene/classifier/BayesianClassifier.java (revision 0)
@@ -0,0 +1,182 @@
+package org.apache.lucene.classifier;
+
+import org.apache.lucene.index.*;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.store.RAMDirectory;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * Base class for classifiers that score each class of an {@link Instances} set from
+ * term statistics read out of its index. Subclasses supply the per-class score in
+ * {@link #classify(Instances, TermDocs, TermDocs, Map, String)}.
+ *
+ * @author karl wettin
+ * Date: 2007-okt-30
+ * Time: 22:53:58
+ */
+public abstract class BayesianClassifier {
+
+  private boolean normalized;
+  private double assumedFeatureProbability;
+  private double assumedFeatureWeight;
+
+  /** Creates a normalized classifier with assumed feature probability 0.4 and weight 1. */
+  protected BayesianClassifier() {
+    this(true, 0.4d, 1d);
+  }
+
+
+  /**
+   * @param normalized if true, term frequencies are multiplied by the decoded field norm
+   * @param assumedFeatureProbability prior used by {@link #weightedFeatureClassProbability}
+   * @param assumedFeatureWeight weight given to that prior
+   */
+  protected BayesianClassifier(boolean normalized, double assumedFeatureProbability, double assumedFeatureWeight) {
+    this.normalized = normalized;
+    this.assumedFeatureProbability = assumedFeatureProbability;
+    this.assumedFeatureWeight = assumedFeatureWeight;
+  }
+
+  /**
+   * Scores a single class for the given features.
+   *
+   * @param instances training data
+   * @param featureDocs reusable enumeration, re-seeked to each feature term
+   * @param classDocs reusable enumeration, re-seeked to each class term
+   * @param features feature term mapped to its (optionally normalized) frequency
+   * @param _class the class to score
+   * @return the score of _class; higher sorts first in {@link #classify(Instances, Map)}
+   * @throws java.io.IOException if the index cannot be read
+   */
+  public abstract double classify(Instances instances, TermDocs featureDocs, TermDocs classDocs, Map<Term, Double> features, String _class) throws IOException;
+
+
+  /**
+   * Extracts features from the text and classifies them.
+   *
+   * @return one classification per known class, as ordered by {@link Classification#compareTo}
+   */
+  public Classification[] classify(Instances instances, String text) throws IOException {
+    return classify(instances, instances.extractFeatures(normalized, text));
+  }
+
+  /**
+   * Scores every class known to the instances for the given features.
+   *
+   * @param instances training data
+   * @param features feature term mapped to its frequency
+   * @return one classification per known class, as ordered by {@link Classification#compareTo}
+   * @throws IOException if the index cannot be read
+   */
+  public Classification[] classify(Instances instances, Map<Term, Double> features) throws IOException {
+    String[] classes = instances.getClasses();
+    Classification[] ret = new Classification[classes.length];
+    IndexReader reader = instances.getIndexReader();
+    // try/finally so neither enumeration is leaked when scoring throws
+    TermDocs featureDocs = reader.termDocs();
+    try {
+      TermDocs classDocs = reader.termDocs();
+      try {
+        for (int i = 0; i < ret.length; i++) {
+          ret[i] = new Classification(classes[i], classify(instances, featureDocs, classDocs, features, classes[i]));
+        }
+      } finally {
+        classDocs.close();
+      }
+    } finally {
+      featureDocs.close();
+    }
+    Arrays.sort(ret);
+    return ret;
+  }
+
+  /**
+   * Sums the frequency of a feature over all non-deleted documents of a class.
+   *
+   * @param instances training data
+   * @param featureDocs enumeration that is seeked to the feature term
+   * @param classDocs enumeration that is seeked to the class term
+   * @param feature the feature term
+   * @param _class the class
+   * @return number of occurrences of feature in class, multiplied by the decoded
+   *         norm of each document when this classifier is normalized
+   * @throws java.io.IOException if the index cannot be read
+   */
+  public double classFeatureFrequency(Instances instances, TermDocs featureDocs, TermDocs classDocs, Term feature, String _class) throws IOException {
+    IndexReader reader = instances.getIndexReader();
+    // may be null when the field carries no norms; raw frequencies are used then
+    byte[] norms = normalized ? reader.norms(feature.field()) : null;
+    featureDocs.seek(feature);
+    classDocs.seek(new Term(instances.getClassField(), _class));
+
+    // TermDocs.doc() is only valid after next() has been called. Position the class
+    // enumeration first so that an unpositioned value is never compared with a real
+    // document number (which could credit document 0 to every class).
+    if (!classDocs.next()) {
+      return 0d;
+    }
+
+    double ret = 0d;
+    while (featureDocs.next()) {
+      int doc = featureDocs.doc();
+      if (classDocs.doc() < doc && !classDocs.skipTo(doc)) {
+        break; // the class has no documents at or after doc
+      }
+      if (classDocs.doc() == doc && !reader.isDeleted(doc)) {
+        if (norms != null) {
+          ret += featureDocs.freq() * Similarity.decodeNorm(norms[doc]);
+        } else {
+          ret += featureDocs.freq();
+        }
+      }
+    }
+    return ret;
+  }
+
+
+  /**
+   * Calculates the probability for a specific feature to be a member of a specific class.
+   *
+   * @param instances training data
+   * @param featureDocs enumeration that is seeked to the feature term
+   * @param classDocs enumeration that is seeked to the class term
+   * @param feature the feature term
+   * @param _class the class
+   * @return Pr(feature|class), or 0 if the class has no instances
+   * @throws java.io.IOException if the index cannot be read
+   */
+  public double featureClassProbability(Instances instances, TermDocs featureDocs, TermDocs classDocs, Term feature, String _class) throws IOException {
+    int numInstancesInClass = instances.numInstancesInClass(_class);
+    if (numInstancesInClass == 0) {
+      return 0d;
+    }
+    return classFeatureFrequency(instances, featureDocs, classDocs, feature, _class) / (double) numInstancesInClass;
+  }
+
+
+  /**
+   * Blends Pr(feature|class) with the assumed feature probability. The observed
+   * probability is weighted by the total frequency of the feature over all classes,
+   * the assumed probability by the assumed feature weight.
+   *
+   * @param instances training data
+   * @param featureDocs enumeration that is seeked to the feature term
+   * @param classDocs enumeration that is seeked to the class terms
+   * @param feature the feature term
+   * @param _class the class
+   * @return assumed Pr(feature|class)
+   * @throws IOException if the index cannot be read
+   */
+  public double weightedFeatureClassProbability(Instances instances, TermDocs featureDocs, TermDocs classDocs, Term feature, String _class) throws IOException {
+    double totalFeatureFrequency = 0d;
+    for (String _class2 : instances.getClasses()) {
+      totalFeatureFrequency += classFeatureFrequency(instances, featureDocs, classDocs, feature, _class2);
+    }
+    double observed = featureClassProbability(instances, featureDocs, classDocs, feature, _class);
+    return (assumedFeatureWeight * assumedFeatureProbability + totalFeatureFrequency * observed)
+        / (assumedFeatureWeight + totalFeatureFrequency);
+  }
+
+
+  /**
+   * @param instances training data
+   * @param _class the class
+   * @return Pr(class), the share of all instances that belong to _class; 0 if there are no instances
+   * @throws IOException if the index cannot be read
+   */
+  public double classProbability(Instances instances, String _class) throws IOException {
+    int numInstances = instances.numInstances();
+    if (numInstances == 0) {
+      return 0d; // avoid 0/0 = NaN on an empty index
+    }
+    return ((double) instances.numInstancesInClass(_class) / (double) numInstances);
+  }
+
+
+  public double getAssumedFeatureProbability() {
+    return assumedFeatureProbability;
+  }
+
+  public double getAssumedFeatureWeight() {
+    return assumedFeatureWeight;
+  }
+
+
+  public void setAssumedFeatureProbability(double assumedFeatureProbability) {
+    this.assumedFeatureProbability = assumedFeatureProbability;
+  }
+
+  public void setAssumedFeatureWeight(double assumedFeatureWeight) {
+    this.assumedFeatureWeight = assumedFeatureWeight;
+  }
+
+
+  public boolean isNormalized() {
+    return normalized;
+  }
+
+  public void setNormalized(boolean normalized) {
+    this.normalized = normalized;
+  }
+}
Index: contrib/classifier/src/java/org/apache/lucene/classifier/Instances.java
===================================================================
--- contrib/classifier/src/java/org/apache/lucene/classifier/Instances.java (revision 0)
+++ contrib/classifier/src/java/org/apache/lucene/classifier/Instances.java (revision 0)
@@ -0,0 +1,177 @@
+package org.apache.lucene.classifier;
+
+import org.apache.lucene.index.*;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+
+import java.io.IOException;
+import java.util.*;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * Training data for the classifiers, kept as documents in a Lucene index. One
+ * field of every document holds its class; the distinct terms of that field are
+ * the known classes.
+ *
+ * @author karl wettin
+ * Date: 2007-okt-24
+ * Time: 01:17:06
+ */
+public class Instances {
+
+  private Directory directory;
+
+  private String classField;
+  private String[] classes;
+
+  private InstanceFactory documentFactory;
+  private IndexReader indexReader;
+  private IndexWriter indexWriter;
+
+
+  /**
+   * Opens a reader on the directory and loads the classes found in classField.
+   *
+   * @param directory an existing index
+   * @param documentFactory builds documents from text and supplies the analyzer
+   * @param classField name of the field that holds the class of each document
+   * @throws IOException if the index cannot be opened or read
+   */
+  public Instances(Directory directory, InstanceFactory documentFactory, String classField) throws IOException {
+    this.directory = directory;
+    this.documentFactory = documentFactory;
+
+    indexReader = IndexReader.open(directory);
+    setClassField(classField);
+  }
+
+  /** Sets the class field and reloads the classes from the current reader. */
+  public void setClassField(String classField) throws IOException {
+    this.classField = classField.intern();
+    loadClasses();
+  }
+
+  /**
+   * Collects every term text of the class field from the current reader.
+   *
+   * @throws IOException if the index cannot be read
+   */
+  public void loadClasses() throws IOException {
+    List<String> found = new ArrayList<String>(100);
+    // positioned at the first term at or after (classField, ""), if any
+    TermEnum termEnum = indexReader.terms(new Term(this.classField, ""));
+    try {
+      do {
+        Term term = termEnum.term();
+        if (term == null || !this.classField.equals(term.field())) {
+          break; // ran off the end of the class field
+        }
+        found.add(term.text());
+      } while (termEnum.next());
+    } finally {
+      termEnum.close();
+    }
+    this.classes = found.toArray(new String[0]);
+  }
+
+  /**
+   * Adds a training document. It is not visible to the reader, and its class is
+   * not listed by {@link #getClasses()}, until {@link #flush()} is called.
+   */
+  public void addInstance(String text, String _class) throws IOException {
+    if (indexWriter == null) {
+      indexWriter = new IndexWriter(directory, documentFactory.getAnalyzer(), false);
+    }
+    indexWriter.addDocument(documentFactory.factory(text, _class));
+  }
+
+  /**
+   * Commits pending instances by closing the writer, then reopens the reader and
+   * reloads the classes. Does nothing if no instance was added since the last flush.
+   */
+  public void flush() throws IOException {
+    if (indexWriter != null) {
+      indexWriter.close();
+      indexWriter = null;
+      IndexReader previous = indexReader;
+      indexReader = IndexReader.open(directory);
+      if (previous != null) {
+        previous.close(); // the superseded reader would otherwise never be closed
+      }
+      loadClasses();
+    }
+  }
+
+  /**
+   *
+   * @param _class
+   * @return number of instances in class parameter _class
+   * @throws java.io.IOException
+   */
+  public int numInstancesInClass(String _class) throws IOException {
+    return indexReader.docFreq(new Term(classField, _class));
+  }
+
+  /**
+   * @return number of instances in this classifier
+   * @throws java.io.IOException
+   */
+  public int numInstances() throws IOException {
+    return indexReader.numDocs();
+  }
+
+
+  /** Flushes pending instances, then closes the reader and the directory. */
+  public void close() throws IOException {
+    try {
+      flush();
+    } finally {
+      try {
+        indexReader.close();
+      } finally {
+        directory.close();
+      }
+    }
+  }
+
+//  public Map extractFeatures(boolean normalized, String text) throws IOException {
+//    InstantiatedIndex ii = new InstantiatedIndex();
+//    InstantiatedIndexWriter iiw = ii.indexWriterFactory(documentFactory.getAnalyzer(), false);
+//    iiw.addDocument(documentFactory.factory(text, ""));
+//    iiw.close();
+//    InstantiatedIndexReader iir = ii.indexReaderFactory();
+//    Map features = extractFeatures(normalized, iir, 0);
+//    iir.close();
+//    ii.close();
+//    return features;
+//  }
+
+  /**
+   * Extracts the features of a text by indexing it as the only document of a
+   * temporary in-memory index, using the document factory and its analyzer.
+   *
+   * @param normalized if true, frequencies are multiplied by the decoded field norm
+   * @param text the text to extract features from
+   * @return feature term mapped to its frequency
+   * @throws IOException if the temporary index cannot be written or read
+   */
+  public Map<Term, Double> extractFeatures(boolean normalized, String text) throws IOException {
+    RAMDirectory ramDirectory = new RAMDirectory();
+    try {
+      IndexWriter iw = new IndexWriter(ramDirectory, getDocumentFactory().getAnalyzer(), true);
+      try {
+        iw.addDocument(getDocumentFactory().factory(text, ""));
+      } finally {
+        iw.close();
+      }
+      IndexReader ir = IndexReader.open(ramDirectory);
+      try {
+        return extractFeatures(ir, 0, normalized);
+      } finally {
+        ir.close();
+      }
+    } finally {
+      ramDirectory.close();
+    }
+  }
+
+  /**
+   * Reads the features of one document from its term vectors.
+   *
+   * @param indexReader the reader holding the document
+   * @param documentNumber the document to read
+   * @param normalized if true, frequencies are multiplied by the decoded field norm
+   * @return feature term mapped to its frequency; empty if the document has no term vectors
+   * @throws IOException if the index cannot be read
+   */
+  public Map<Term, Double> extractFeatures(IndexReader indexReader, int documentNumber, boolean normalized) throws IOException {
+    TermFreqVector[] termFreqVectors = indexReader.getTermFreqVectors(documentNumber);
+    if (termFreqVectors == null) {
+      // no field of this document stored term vectors
+      return new HashMap<Term, Double>();
+    }
+    int numFeatures = 0;
+    for (TermFreqVector termFreqVector : termFreqVectors) {
+      numFeatures += termFreqVector.size();
+    }
+    Map<Term, Double> features = new HashMap<Term, Double>(numFeatures);
+    for (TermFreqVector termFreqVector : termFreqVectors) {
+      String field = termFreqVector.getField();
+      double norm = 1d;
+      if (normalized) {
+        byte[] norms = indexReader.norms(field);
+        if (norms != null) { // fields without norms keep their raw frequencies
+          norm = Similarity.decodeNorm(norms[documentNumber]);
+        }
+      }
+      String[] terms = termFreqVector.getTerms();
+      int[] frequencies = termFreqVector.getTermFrequencies();
+      for (int i = 0; i < terms.length; i++) {
+        features.put(new Term(field, terms[i]), frequencies[i] * norm);
+      }
+    }
+    return features;
+  }
+
+
+
+  /** @return the classes found by the most recent {@link #loadClasses()} */
+  public String[] getClasses() throws IOException {
+    return classes;
+  }
+
+  public String getClassField() {
+    return classField;
+  }
+
+  public Directory getDirectory() {
+    return directory;
+  }
+
+  public InstanceFactory getDocumentFactory() {
+    return documentFactory;
+  }
+
+  /** @return the current reader; it is replaced and closed by {@link #flush()} */
+  public IndexReader getIndexReader() {
+    return indexReader;
+  }
+}
Index: contrib/classifier/src/java/org/apache/lucene/classifier/FishersMethodClassifier.java
===================================================================
--- contrib/classifier/src/java/org/apache/lucene/classifier/FishersMethodClassifier.java (revision 0)
+++ contrib/classifier/src/java/org/apache/lucene/classifier/FishersMethodClassifier.java (revision 0)
@@ -0,0 +1,98 @@
+package org.apache.lucene.classifier;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+
+import java.io.IOException;
+import java.util.Map;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * Classifier that combines the weighted feature probabilities of a class with an
+ * inverse chi-square style series.
+ *
+ * http://en.wikipedia.org/wiki/Fisher%27s_method
+ *
+ * @author karl wettin
+ * Date: 2007-okt-26
+ * Time: 13:24:42
+ */
+public class FishersMethodClassifier extends BayesianClassifier {
+
+
+  public FishersMethodClassifier() {
+    super();
+  }
+
+  public FishersMethodClassifier(boolean normalized, double assumedFeatureProbability, double assumedFeatureWeight) {
+    super(normalized, assumedFeatureProbability, assumedFeatureWeight);
+  }
+
+  /**
+   * Normalizes Pr(feature|class) by the sum of Pr(feature|c) over all known classes.
+   *
+   * @param instances training data
+   * @param featureDocs enumeration that is seeked to the feature term
+   * @param classDocs enumeration that is seeked to the class terms
+   * @param feature the feature term
+   * @param _class the class
+   * @return Pr(class|feature), or 0 if the feature does not occur in the class
+   * @throws IOException if the index cannot be read
+   */
+  public double classFeatureProbability(Instances instances, TermDocs featureDocs, TermDocs classDocs, Term feature, String _class) throws IOException {
+    double featureClassProbability = featureClassProbability(instances, featureDocs, classDocs, feature, _class);
+    if (featureClassProbability == 0d) {
+      return 0d;
+    }
+
+    double featureClassProbabilitySum = 0d;
+    for (String _class2 : instances.getClasses()) {
+      featureClassProbabilitySum += featureClassProbability(instances, featureDocs, classDocs, feature, _class2);
+    }
+
+    return featureClassProbability / featureClassProbabilitySum;
+  }
+
+  /**
+   * Scores a class for the given features.
+   *
+   * @param instances training data
+   * @param featureDocs enumeration that is seeked to each feature term
+   * @param classDocs enumeration that is seeked to the class terms
+   * @param features feature term ({@link Term}) mapped to its frequency ({@link Double});
+   *                 the parameter type is left raw so the signature keeps the erasure
+   *                 of the inherited abstract method
+   * @param _class the class to score
+   * @return a score capped at 1
+   * @throws IOException if the index cannot be read
+   */
+  public double classify(Instances instances, TermDocs featureDocs, TermDocs classDocs, Map features, String _class) throws IOException {
+    @SuppressWarnings("unchecked") // entries are Term -> Double, see parameter documentation
+    Map<Term, Double> typedFeatures = features;
+
+    // Sum logarithms rather than multiplying the probabilities: the product of many
+    // small factors underflows to 0, whose logarithm would turn the score into NaN.
+    double logFeaturesClassProbability = 0d;
+    double distance = 0d;
+
+    for (Map.Entry<Term, Double> feature : typedFeatures.entrySet()) {
+      logFeaturesClassProbability += Math.log(weightedFeatureClassProbability(instances, featureDocs, classDocs, feature.getKey(), _class));
+      distance += feature.getValue();
+    }
+
+    // http://en.wikipedia.org/wiki/Inverse-chi-square_distribution
+
+    // (-2 * ln(product)) / 2
+    double meanChi = -logFeaturesClassProbability;
+    double inverse = Math.exp(-meanChi);
+    double sum = inverse;
+    for (double i = 1d; i < distance; i++) {
+      inverse *= meanChi / i;
+      sum += inverse;
+    }
+    return Math.min(sum, 1d);
+
+  }
+
+}
Index: contrib/classifier/src/java/org/apache/lucene/classifier/Classification.java
===================================================================
--- contrib/classifier/src/java/org/apache/lucene/classifier/Classification.java (revision 0)
+++ contrib/classifier/src/java/org/apache/lucene/classifier/Classification.java (revision 0)
@@ -0,0 +1,50 @@
+package org.apache.lucene.classifier;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * A class name paired with the score a classifier gave it. The natural ordering
+ * puts the highest score first, so a sorted array starts with the best match.
+ *
+ * @author karl wettin
+ * Date: 2007-okt-30
+ * Time: 21:59:59
+ */
+public class Classification implements Comparable<Classification> {
+  private final String classification;
+  private final Double score;
+
+  /**
+   * @param classification name of the class
+   * @param score score of the class; must not be null if instances are compared
+   */
+  public Classification(String classification, Double score) {
+    this.classification = classification;
+    this.score = score;
+  }
+
+  /** Orders by descending score. */
+  public int compareTo(Classification classification) {
+    return classification.score.compareTo(score);
+  }
+
+
+  public String toString() {
+    return classification + " " + score;
+  }
+
+  public String getClassification() {
+    return classification;
+  }
+
+  public Double getScore() {
+    return score;
+  }
+}
+
+
Index: contrib/classifier/src/java/org/apache/lucene/classifier/NaiveBayesClassifier.java
===================================================================
--- contrib/classifier/src/java/org/apache/lucene/classifier/NaiveBayesClassifier.java (revision 0)
+++ contrib/classifier/src/java/org/apache/lucene/classifier/NaiveBayesClassifier.java (revision 0)
@@ -0,0 +1,77 @@
+package org.apache.lucene.classifier;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+
+import java.io.IOException;
+import java.util.Map;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * Classifier that scores a class as Pr(features|class) * Pr(class).
+ *
+ * http://en.wikipedia.org/wiki/Naive_Bayes_classifier
+ *
+ * @author karl wettin
+ * Date: 2007-okt-24
+ * Time: 05:38:00
+ */
+public class NaiveBayesClassifier extends BayesianClassifier {
+
+
+  public NaiveBayesClassifier() {
+    super();
+  }
+
+  public NaiveBayesClassifier(boolean normalized, double assumedFeatureProbability, double assumedFeatureWeight) {
+    super(normalized, assumedFeatureProbability, assumedFeatureWeight);
+  }
+
+  /**
+   * Multiplies the weighted probability of every feature, each raised to the
+   * power of one plus the frequency of that feature.
+   *
+   * @param instances training data
+   * @param featureDocs enumeration that is seeked to each feature term
+   * @param classDocs enumeration that is seeked to the class terms
+   * @param features feature term ({@link Term}) mapped to its frequency ({@link Double})
+   * @param _class the class
+   * @return Pr(features|class)
+   * @throws IOException if the index cannot be read
+   */
+  public double featuresClassProbability(Instances instances, TermDocs featureDocs, TermDocs classDocs, Map features, String _class) throws IOException {
+    @SuppressWarnings("unchecked") // entries are Term -> Double, see parameter documentation
+    Map<Term, Double> typedFeatures = features;
+
+    double ret = 1;
+    for (Map.Entry<Term, Double> featureFrequency : typedFeatures.entrySet()) {
+      double wfp = weightedFeatureClassProbability(instances, featureDocs, classDocs, featureFrequency.getKey(), _class);
+      ret *= Math.pow(wfp, 1 + featureFrequency.getValue());
+    }
+    return ret;
+  }
+
+  /**
+   * Scores a class for the given features.
+   *
+   * @param instances training data
+   * @param featureDocs enumeration that is seeked to each feature term
+   * @param classDocs enumeration that is seeked to the class terms
+   * @param features feature term ({@link Term}) mapped to its frequency ({@link Double});
+   *                 the parameter type is left raw so the signature keeps the erasure
+   *                 of the inherited abstract method
+   * @param _class the class to score
+   * @return Pr(features|class) * Pr(class)
+   * @throws IOException if the index cannot be read
+   */
+  public double classify(Instances instances, TermDocs featureDocs, TermDocs classDocs, Map features, String _class) throws IOException {
+    return
+        featuresClassProbability(instances, featureDocs, classDocs, features, _class)
+            * classProbability(instances, _class);
+  }
+
+
+}
Index: contrib/classifier/src/java/org/apache/lucene/classifier/InstanceFactory.java
===================================================================
--- contrib/classifier/src/java/org/apache/lucene/classifier/InstanceFactory.java (revision 0)
+++ contrib/classifier/src/java/org/apache/lucene/classifier/InstanceFactory.java (revision 0)
@@ -0,0 +1,32 @@
+package org.apache.lucene.classifier;
+
+import org.apache.lucene.document.Document;
+import 
+ org.apache.lucene.analysis.Analyzer;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * Turns a text and its class into an indexable document and supplies the
+ * analyzer that document is indexed with.
+ *
+ * @author karl wettin
+ * Date: 2007-okt-30
+ * Time: 22:32:34
+ */
+public interface InstanceFactory {
+
+  /**
+   * @param text the text of the instance
+   * @param _class the class of the instance
+   * @return the document to index for this instance
+   */
+  Document factory(String text, String _class);
+
+  /** @return the analyzer used when indexing documents built by {@link #factory} */
+  Analyzer getAnalyzer();
+
+
+}
Index: contrib/classifier/src/java/org/apache/lucene/classifier/package.html
===================================================================
--- contrib/classifier/src/java/org/apache/lucene/classifier/package.html (revision 0)
+++ contrib/classifier/src/java/org/apache/lucene/classifier/package.html (revision 0)
@@ -0,0 +1,7 @@
+
+
+
+Based on the Bayesian classifiers described by Toby Segaran in "Programming Collective Intelligence", O'Reilly, ISBN 978-0-596-52032-1.
+
+
+
\ No newline at end of file