Index: contrib/clusterer/src/test/org/apache/lucene/analysis/TestMarkovChainTokenFilter.java =================================================================== --- contrib/clusterer/src/test/org/apache/lucene/analysis/TestMarkovChainTokenFilter.java (revision 0) +++ contrib/clusterer/src/test/org/apache/lucene/analysis/TestMarkovChainTokenFilter.java (revision 0) @@ -0,0 +1,46 @@ +package org.apache.lucene.analysis; + +import junit.framework.TestCase; + +import java.io.StringReader; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +public class TestMarkovChainTokenFilter extends TestCase { + + public void test() throws Exception { + + String text = "this is a test"; + String[] expected = new String[]{ + "this", "this is", "this is a", + "is", "is a", "is a test", + "a", "a test", + "test" + }; + + TokenStream ts = new WhitespaceTokenizer(new StringReader(text)); + ts = new MarkovChainTokenFilter(ts, 1, 3); + + for (String termText : expected) { + Token token = ts.next(); + assertEquals(termText, new String(token.termBuffer(), 0, token.termLength())); + } + + assertNull(ts.next()); + + } + +} Index: contrib/clusterer/src/test/org/apache/lucene/index/TestTokenStreamTermVectorMapper.java =================================================================== --- contrib/clusterer/src/test/org/apache/lucene/index/TestTokenStreamTermVectorMapper.java (revision 0) +++ contrib/clusterer/src/test/org/apache/lucene/index/TestTokenStreamTermVectorMapper.java (revision 0) @@ -0,0 +1,66 @@ +package org.apache.lucene.index; + +import junit.framework.TestCase; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; + +import java.util.Collections; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + + +/** + * @author karl wettin + * Date: 2007-okt-15 + * Time: 02:53:10 + */ +public class TestTokenStreamTermVectorMapper extends TestCase { + + public void test() throws Exception { + + Directory directory = new RAMDirectory(); + IndexWriter indexWriter = new IndexWriter(directory, new StandardAnalyzer(Collections.emptySet()), true); + + String title = "this is the actual title we want"; + String[] tokens = new String[]{"this", "is", "the", "actual", "title", "we", "want"}; + + Document document = new Document(); + document.add(new Field("title", title, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); + indexWriter.addDocument(document); + indexWriter.close(); + + IndexReader indexReader = IndexReader.open(directory); + TokenStreamTermVectorMapper mapper = new TokenStreamTermVectorMapper(); + indexReader.getTermFreqVector(0, mapper); + + TokenStream ts = mapper.tokenStreamFactory(); + for (String termText : tokens) { + Token token = ts.next(); + assertEquals(termText, new String(token.termBuffer(), 0, token.termLength())); + } + assertNull(ts.next()); + + indexReader.close(); + + directory.close(); + + } + +} Index: contrib/clusterer/src/test/org/apache/lucene/index/TestTermVectorAccessor.java =================================================================== --- contrib/clusterer/src/test/org/apache/lucene/index/TestTermVectorAccessor.java (revision 0) +++ contrib/clusterer/src/test/org/apache/lucene/index/TestTermVectorAccessor.java (revision 0) @@ -0,0 +1,112 @@ +package org.apache.lucene.index; + +import junit.framework.TestCase; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; + +import java.util.Collections; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + + +public class TestTermVectorAccessor extends TestCase { + + public void test() throws Exception { + + Directory dir = new RAMDirectory(); + IndexWriter iw = new IndexWriter(dir, new StandardAnalyzer(Collections.EMPTY_SET), true); + + Document doc; + + doc = new Document(); + doc.add(new Field("a", "a b a c a d a e a f a g a h a", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); + doc.add(new Field("b", "a b c b d b e b f b g b h b", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); + doc.add(new Field("c", "a c b c d c e c f c g c h c", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); + iw.addDocument(doc); + + doc = new Document(); + doc.add(new Field("a", "a b a c a d a e a f a g a h a", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS)); + doc.add(new Field("b", "a b c b d b e b f b g b h b", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS)); + doc.add(new Field("c", "a c b c d c e c f c g c h c", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS)); + iw.addDocument(doc); + + doc = new Document(); + doc.add(new Field("a", "a b a c a d a e a f a g a h a", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES)); + doc.add(new Field("b", "a b c b d b e b f b g b h b", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES)); + doc.add(new Field("c", "a c b c d c e c f c g c h c", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES)); + iw.addDocument(doc); + + doc = new Document(); + doc.add(new Field("a", "a b a c a d a e a f a g a h a", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO)); + doc.add(new Field("b", "a b c b d b e b f b g b h b", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO)); + doc.add(new Field("c", "a c b c d c e c f c g c h c", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO)); + iw.addDocument(doc); + + doc = new Document(); + doc.add(new Field("a", "a b a c a d a e a f a g a h a", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); + doc.add(new Field("b", "a b c b d b e b f b g b h b", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO)); + doc.add(new Field("c", "a c b c d c e c f c g c h c", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES)); + iw.addDocument(doc); + + iw.close(); + + IndexReader ir = IndexReader.open(dir); + + TermVectorAccessor accessor = new TermVectorAccessor(); + + ParallelArrayTermVectorMapper mapper; + TermFreqVector tfv; + + for (int i = 0; i < ir.maxDoc(); i++) { + + mapper = new ParallelArrayTermVectorMapper(); + accessor.accept(ir, i, "a", mapper); + tfv = mapper.materializeVector(); + assertEquals("doc " + i,"a", tfv.getTerms()[0]); + assertEquals("doc " + i, 8, tfv.getTermFrequencies()[0]); + + mapper = new ParallelArrayTermVectorMapper(); + accessor.accept(ir, i, "b", mapper); + tfv = mapper.materializeVector(); + assertEquals("doc " + i, 8, tfv.getTermFrequencies().length); + assertEquals("doc " + i,"b", tfv.getTerms()[1]); + assertEquals("doc " + i, 7, tfv.getTermFrequencies()[1]); + + mapper = new ParallelArrayTermVectorMapper(); + accessor.accept(ir, i, "c", mapper); + tfv = mapper.materializeVector(); + assertEquals("doc " + i, 8, tfv.getTermFrequencies().length); + assertEquals("doc " + i,"c", tfv.getTerms()[2]); + assertEquals("doc " + i, 7, tfv.getTermFrequencies()[2]); + + mapper = new ParallelArrayTermVectorMapper(); + accessor.accept(ir, i, "q", 
mapper); + tfv = mapper.materializeVector(); + assertNull("doc " + i, tfv); + + } + + + ir.close(); + + dir.close(); + + + } + +} Index: contrib/clusterer/src/test/org/apache/lucene/clusterer/tree/TestTree.java =================================================================== --- contrib/clusterer/src/test/org/apache/lucene/clusterer/tree/TestTree.java (revision 0) +++ contrib/clusterer/src/test/org/apache/lucene/clusterer/tree/TestTree.java (revision 0) @@ -0,0 +1,284 @@ +package org.apache.lucene.clusterer.tree; + +import junit.framework.TestCase; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.MarkovChainTokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.snowball.SnowballFilter; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.IdxReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; + +import java.io.*; +import java.text.DecimalFormat; +import java.util.*; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *
+ */
+
+
+/**
+ * @author karl wettin
+ * Date: 2007-okt-03
+ * Time: 21:37:13
+ */
+public class TestTree extends TestCase {
+
+  public void test() throws Exception {
+
+    Set<Term> instanceIdentities = new HashSet<Term>(10000);
+
+    System.out.println("Indexing..");
+
+    IdxReader idxReader = new IdxReader();
+
+    Directory dir = new RAMDirectory();
+    IndexWriter iw = new IndexWriter(dir, new StandardAnalyzer(), true);
+
+    index("/Users/kalle/projekt/data/news/extracted/040101", idxReader, iw, instanceIdentities);
+    index("/Users/kalle/projekt/data/news/extracted/040102", idxReader, iw, instanceIdentities);
+    index("/Users/kalle/projekt/data/news/extracted/040103", idxReader, iw, instanceIdentities);
+//    index("/Users/kalle/projekt/data/news/extracted/040104", idxReader, iw, instanceIdentities);
+//    index("/Users/kalle/projekt/data/news/extracted/040105", idxReader, iw, instanceIdentities);
+//    index("/Users/kalle/projekt/data/news/extracted/040106", idxReader, iw, instanceIdentities);
+//    index("/Users/kalle/projekt/data/news/extracted/040107", idxReader, iw, instanceIdentities);
+//    index("/Users/kalle/projekt/data/news/extracted/040108", idxReader, iw, instanceIdentities);
+
+
+    iw.close();
+
+    System.out.println("Training " + instanceIdentities.size() + " instances..");
+
+    Writer insertTimer = new OutputStreamWriter(new FileOutputStream("insertMsCurve.txt"), "UTF8");
+    Writer rebuildTimer = new OutputStreamWriter(new FileOutputStream("rebuildMsCurve.txt"), "UTF8");
+
+    IndexReader ir = IndexReader.open(dir);
+    Tree tree = new Tree(new String[]{"body/stemmed"});
+
+    double sum = 0;
+    int i = 0;
+    int cnt = 0;
+    for (Term instanceIdentity : instanceIdentities) {
+      try {
+        long ms = System.currentTimeMillis();
+        Instance instance = tree.insertFromRoot(ir, instanceIdentity, insertTimer, rebuildTimer);
+        sum += System.currentTimeMillis() - ms;
+
+        instance.setTitle(ir.document(tree.findDocumentNumberByDocumentIdentity(ir, instanceIdentity)).get("title"));
+
+        cnt++;
+
+        if (i++ == 100) {
+          System.out.println(cnt + "\t" + (sum / 100d));
+          sum = 0;
+          i = 0;
+          insertTimer.flush();
+          rebuildTimer.flush();
+        }
+
+//        if (cnt > 1000) {
+//          break;
+//        }
+
+      } catch (Exception e) {
+        e.printStackTrace();
+      }
+    }
+
+    insertTimer.close();
+    rebuildTimer.close();
+
+    ir.close();
+
+    dir.close();
+
+
+    for (double d = 1; d <= 6d; d += 0.5d) {
+
+      String fileName = "/Users/kalle/tmp/out_" + d + ".txt";
+
+      TreeClusterer treeClusterer = extractClusters(tree, fileName, d);
+
+//      Tree clusterTree = new Tree(tree.getFields());
+//      for (TreeCluster cluster : treeClusterer.getClusters()) {
+//        Map<String, Map<String, Double>>[] vectorspaces = new Map[cluster.getInstances().size()];
+//        for (i = 0; i < cluster.getInstances().size(); i++) {
+//          vectorspaces[i] = cluster.getInstances().get(i).getVectorSpace();
+//        }
+//        Map<String, Map<String, Double>> meanVectorSpace = tree.calculateMeanVectorSpace(vectorspaces);
+//        Instance instance = new Instance(new Term("cluster", "a"), meanVectorSpace);
+//        instance.setTitle(cluster.getInstances().get(0).getTitle() + "(" + cluster.getInstances().size() + ")");
+//        clusterTree.insertFromRoot(instance);
+//      }
+//
+//      for (double d2 = 1; d2 <= 6d; d2 += 0.5d) {
+//        fileName = "/Users/kalle/tmp/out_clusters_" + d + "_" + d2 + ".txt";
+//        extractClusters(clusterTree, fileName, d2);
+//      }
+
+    }
+
+  }
+
+  private TreeClusterer extractClusters(Tree tree, String fileName, double d) throws IOException {
+    System.out.println("Extracting clusters..");
+
+    TreeClusterer clusterer = new TreeClusterer(tree, d);
+    clusterer.resolveAllClusters();
+
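+    // order the clusters by descending instance count so the largest clusters are written first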
+    List<TreeCluster> clusters = new ArrayList<TreeCluster>(clusterer.getClusters());
+    Collections.sort(clusters, new Comparator<TreeCluster>() {
+
+      public int compare(TreeCluster treeCluster, TreeCluster treeCluster1) {
+        return new Integer(treeCluster1.getInstances().size()).compareTo(treeCluster.getInstances().size());
+      }
+    });
+
+
+    writeClustersOutput(fileName, clusterer, clusters);
+
+
+    return clusterer;
+
+
+  }
+
+
+  private static final DecimalFormat df = new DecimalFormat("#.##");
+
+  private void writeClustersOutput(String fileName, TreeClusterer clusterer, List<TreeCluster> clusters) throws IOException {
+
+    System.out.println("Creating file " + fileName + "..");
+
+
+    Writer fw = new OutputStreamWriter(new FileOutputStream(new File(fileName)), "UTF8");
+
+
+    int i = 0;
+    for (TreeCluster cluster : clusters) {
+      fw.write(String.valueOf(i++));
+      fw.write("\n");
+      for (Instance node : cluster.getInstances()) {
+        fw.write("\t");
+        fw.write(node.getTitle());
+        fw.write("\t");
+        fw.write(node.getIdentity().text());
+        fw.write("\n");
+      }
+      for (Map.Entry<Node, Double> node : cluster.getBoundaries().entrySet()) {
+        TreeCluster boundaryCluster = clusterer.getClusterByNode().get(node.getKey());
+        if (boundaryCluster != null) {
+          fw.write(df.format(node.getValue()));
+          fw.write(" from cluster(" + boundaryCluster.getInstances().size() + "): ");
+          fw.write(boundaryCluster.getInstances().get(0).getTitle());
+          fw.write("\n");
+        }
+      }
+      fw.write("\n\n");
+    }
+    fw.close();
+  }
+
+  private void index(String path, IdxReader idxReader, IndexWriter iw, Set<Term> documentIdentities) throws IOException {
+    for (File file : new File(path).listFiles(new FilenameFilter() {
+      public boolean accept(File file, String string) {
+        return string.endsWith(".idx");
+      }
+    })) {
+      documentFactory(idxReader, iw, file, documentIdentities);
+    }
+  }
+
+  private Map<String, Analyzer> analyzerByStemmerLanguage = new HashMap<String, Analyzer>();
+
+  private Analyzer analyzerFactory(final String snowballStemmerName, final String language) {
+    return new Analyzer() {
+      public TokenStream tokenStream(String fieldName, Reader reader) {
+        TokenStream ts;
+        ts = new StandardTokenizer(reader);
+        ts = new LowerCaseFilter(ts);
+        if (fieldName.endsWith("/stemmed")) {
+          ts = new SnowballFilter(ts, snowballStemmerName);
+        }
+        ts = new MarkovChainTokenFilter(ts, 1, 2);
+        return ts;
+      }
+    };
+  }
+
+  protected void setUp() throws Exception {
+    analyzerByStemmerLanguage.put("en", analyzerFactory("English", "en"));
+    analyzerByStemmerLanguage.put("no", analyzerFactory("Norwegian", "no"));
+    analyzerByStemmerLanguage.put("ne", analyzerFactory("Norwegian", "no"));
+    analyzerByStemmerLanguage.put("sv", analyzerFactory("Swedish", "sv"));
+    analyzerByStemmerLanguage.put("dk", analyzerFactory("Danish", "dk"));
+    analyzerByStemmerLanguage.put("fi", analyzerFactory("Finnish", "fi"));
+
+  }
+
+  public void documentFactory(IdxReader idxReader, IndexWriter iw, File file, Set<Term> documentIdentities) throws IOException {
+
+    Reader reader = new InputStreamReader(new FileInputStream(file), "ISO8859-1");
+    try {
+      idxReader.parse(reader);
+    } catch (IOException e) {
+      System.out.println(e.getMessage());
+      System.out.println(file);
+      reader.close();
+      return;
+    }
+    reader.close();
+
+    if (!idxReader.getFieldValues().containsKey(IdxReader.DRETITLE_FIELD)) {
+      return;
+    }
+
+    String language = idxReader.getFieldValues().get("Language");
+    if (language == null) {
+      return;
+    }
+
+    language = language.trim().toLowerCase();
+
+    if (!"en".equals(language)) {
+      return;
+    }
+
+    documentIdentities.add(new Term("id", file.getName()));
+
+    Document doc =
new Document(); + doc.add(new Field("id", file.getName(), Field.Store.NO, Field.Index.NO_NORMS)); + doc.add(new Field("title", idxReader.getFieldValues().get(IdxReader.DRETITLE_FIELD), Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); + doc.add(new Field("body/stemmed", idxReader.getFieldValues().get(IdxReader.DRECONTENT_FIELD), Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES)); + doc.add(new Field("language", language, Field.Store.YES, Field.Index.NO_NORMS, Field.TermVector.NO)); + + Analyzer analyzer = analyzerByStemmerLanguage.get(language); + if (analyzer == null) { + throw new RuntimeException(); + } + + iw.addDocument(doc, analyzer); + + } + +} Index: contrib/clusterer/src/test/org/apache/lucene/clusterer/TestTanimotoSimilarity.java =================================================================== --- contrib/clusterer/src/test/org/apache/lucene/clusterer/TestTanimotoSimilarity.java (revision 0) +++ contrib/clusterer/src/test/org/apache/lucene/clusterer/TestTanimotoSimilarity.java (revision 0) @@ -0,0 +1,122 @@ +package org.apache.lucene.clusterer; + +import junit.framework.TestCase; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.TermVectorAccessor; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; + +import java.util.Collections; +import java.io.IOException; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +/** + * + */ +public class TestTanimotoSimilarity extends TestCase { + + public void test() throws Exception { + + Directory dir = new RAMDirectory(); + IndexWriter iw = new IndexWriter(dir, new StandardAnalyzer(Collections.EMPTY_SET), true); + + Document doc; + + doc = new Document(); + doc.add(new Field("title", "introduction", Field.Store.NO, Field.Index.TOKENIZED)); + doc.add(new Field("body", "hi, i've got an appointment with mr. ullman. 
my name is jack torrance.", Field.Store.NO, Field.Index.TOKENIZED)); + iw.addDocument(doc); + + + doc = new Document(); + doc.add(new Field("title", "cabin fever in room 237", Field.Store.NO, Field.Index.TOKENIZED)); + doc.add(new Field("body", "all work and no play makes jack a dull boy.", Field.Store.NO, Field.Index.TOKENIZED)); + iw.addDocument(doc); + + + doc = new Document(); + doc.add(new Field("title", "warning", Field.Store.NO, Field.Index.TOKENIZED)); + doc.add(new Field("body", "jack will become a boring type if he has to work too much and is not given time to play!", Field.Store.NO, Field.Index.TOKENIZED)); + iw.addDocument(doc); + + + iw.close(); + + TanimotoSimilarity similarity = new TanimotoSimilarity(); + TermVectorAccessor termVectorAccessor = new TermVectorAccessor(); + + + IndexReader ir = IndexReader.open(dir); + + double d00 = distance(ir, 0, 0, new String[]{"body"}, termVectorAccessor, similarity); + double d01 = distance(ir, 0, 1, new String[]{"body"}, termVectorAccessor, similarity); + double d02 = distance(ir, 0, 2, new String[]{"body"}, termVectorAccessor, similarity); + + double d10 = distance(ir, 1, 0, new String[]{"body"}, termVectorAccessor, similarity); + double d11 = distance(ir, 1, 1, new String[]{"body"}, termVectorAccessor, similarity); + double d12 = distance(ir, 1, 2, new String[]{"body"}, termVectorAccessor, similarity); + + double d20 = distance(ir, 2, 0, new String[]{"body"}, termVectorAccessor, similarity); + double d21 = distance(ir, 2, 1, new String[]{"body"}, termVectorAccessor, similarity); + double d22 = distance(ir, 2, 2, new String[]{"body"}, termVectorAccessor, similarity); + + assertTrue(d00 == 0d); + assertTrue(d00 < d01); + assertTrue(d01 < d02); + + assertTrue(d11 == 0d); + assertTrue(d11 < d10); + assertTrue(d12 < d10); + + assertTrue(d22 == 0d); + assertTrue(d22 < d21); + assertTrue(d21 < d20); + + + ir.close(); + dir.close(); + + } + + + + /** + * + * @param indexReader + * @param documentNumber0 + * @param documentNumber1 + * @param fields + * @param termVectorAccessor + * @return 0 for perfect match, > 0 for greater distance + * @throws java.io.IOException + */ + private double distance(IndexReader indexReader, int documentNumber0, int documentNumber1, String[] fields, TermVectorAccessor termVectorAccessor, TanimotoSimilarity similarity) throws IOException { + AppendingVectorSpaceFrequencyMapper[] mappers = new AppendingVectorSpaceFrequencyMapper[]{new AppendingVectorSpaceFrequencyMapper(), new AppendingVectorSpaceFrequencyMapper()}; + for (String field : fields) { + termVectorAccessor.accept(indexReader, documentNumber0, field, mappers[0]); + termVectorAccessor.accept(indexReader, documentNumber1, field, mappers[1]); + } + + return similarity.distance(mappers[0].getTermFrequencyVectors(), mappers[1].getTermFrequencyVectors()); + } + + +} Index: contrib/clusterer/src/test/org/apache/lucene/clusterer/TestMarkovChain.java =================================================================== --- contrib/clusterer/src/test/org/apache/lucene/clusterer/TestMarkovChain.java (revision 0) +++ contrib/clusterer/src/test/org/apache/lucene/clusterer/TestMarkovChain.java (revision 0) @@ -0,0 +1,148 @@ +package org.apache.lucene.clusterer; + +import junit.framework.TestCase; +import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.standard.StandardTokenizer; + +import java.io.StringReader; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may 
not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +/** + * @author karl wettin + * Date: 2007-okt-15 + * Time: 03:08:33 + */ +public class TestMarkovChain extends TestCase { + + public void test() throws Exception { + + MarkovChain markovChain = new MarkovChain(); + + markovChain.train(tokenize("Car Bomb at Baghdad Restaurant Kills Five")); + markovChain.train(tokenize("Baghdad car bomb kills 5")); + markovChain.train(tokenize("Baghdad Bombing Kills Five")); + markovChain.train(tokenize("British journalist injured in Baghdad bomb")); + markovChain.train(tokenize("Baghdad Car Bomb Kills Five, Wounds Scores")); + markovChain.train(tokenize("Deadly Baghdad bomb halts New Year's celebrations")); + markovChain.train(tokenize("Bomb at restaurant kills 5, wounds 35")); + markovChain.train(tokenize("Bombers breach Baghdad security as blast rips through restaurant")); + markovChain.train(tokenize("Car Bomb at Baghdad Restaurant Kills Five")); + markovChain.train(tokenize("5 Die in New Year's Car Bomb Blast in Iraq")); + markovChain.train(tokenize("5 die in New Year's car bomb blast in Iraq")); + markovChain.train(tokenize("5 Die in New Year's Car Bomb Blast in Iraq")); + markovChain.train(tokenize("8 Die in New Year's Car Bomb Blast in Iraq")); + markovChain.train(tokenize("Death toll rises to eight in Baghdad restaurant blast")); + markovChain.train(tokenize("8 Die in New Year's Car Bomb Blast in Iraq")); + markovChain.train(tokenize("Toll rises to eight in Iraq bombing")); + markovChain.train(tokenize("Car Bomb at Baghdad Restaurant Kills Five")); + markovChain.train(tokenize("Car Bomb Wrecks Baghdad Restaurant, Kills 5 Iraqis")); + markovChain.train(tokenize("Car Bomb Kills 5 at Baghdad Restaurant")); + markovChain.train(tokenize("Car Bomb at Baghdad Restaurant Kills Five")); + markovChain.train(tokenize("Car Bomb Wrecks Baghdad Restaurant, Kills 5 Iraqis")); + markovChain.train(tokenize("US troops fear more guerrilla attacks in Iraq")); + markovChain.train(tokenize("Five killed and 25 injured by suicide car bomber at Baghdad restaurant")); + markovChain.train(tokenize("5 DEAD IN BAGHDAD BLAST")); + markovChain.train(tokenize("Car bomb kills 5 at Baghdad restaurant")); + markovChain.train(tokenize("Car Bomb at Baghdad Restaurant Kills Five")); + markovChain.train(tokenize("Car Bomb at Baghdad Restaurant Kills Five")); + markovChain.train(tokenize("5 killed in Baghdad restaurant bombing")); + markovChain.train(tokenize("Bomb blast in Baghdad restaurant, five killed")); + markovChain.train(tokenize("Car Bomb at Baghdad Restaurant Kills Five")); + markovChain.train(tokenize("Car Bomb at Baghdad Restaurant Kills Five")); + markovChain.train(tokenize("Baghdad restaurant bomb toll rises")); + markovChain.train(tokenize("Blast in Baghdad restaurant, 5 killed")); + markovChain.train(tokenize("Blast in Baghdad restaurant, 5 killed")); + markovChain.train(tokenize("Blast in Baghdad restaurant, 5 killed")); + markovChain.train(tokenize("Brit hurt in Baghdad blast")); + markovChain.train(tokenize("Five killed in Baghdad car explosion")); + 
markovChain.train(tokenize("Five killed in Baghdad car explosion")); + markovChain.train(tokenize("Five killed in Baghdad car explosion")); + markovChain.train(tokenize("Five killed in Baghdad car explosion")); + markovChain.train(tokenize("Five killed in Baghdad car explosion")); + markovChain.train(tokenize("Five killed in Baghdad car explosion")); + markovChain.train(tokenize("Five killed in Baghdad car explosion")); + markovChain.train(tokenize("Bomb at crowded Baghdad restaurant kills at least five")); + markovChain.train(tokenize("Terrorism Threats Shadow World's New Year")); + markovChain.train(tokenize("Terrorism Threats Shadow World's New Year")); + markovChain.train(tokenize("Terrorism Threats Shadow World's New Year")); + markovChain.train(tokenize("World Rings in 2004 Amid Grim Backdrop")); + markovChain.train(tokenize("World Rings in 2004 Amid Grim Backdrop")); + markovChain.train(tokenize("Threats temper big celebration")); + markovChain.train(tokenize("World Rings in 2004 Amid Grim Backdrop")); + markovChain.train(tokenize("World Rings in 2004 Amid Grim Backdrop")); + markovChain.train(tokenize("World Celebrates 2004 Amid Grim Backdrop")); + + assertEquals(markovChain.extractProminentChainString(), "car bomb blast in baghdad restaurant kills five"); + + markovChain = new MarkovChain(); + + markovChain.train(tokenize("Revelers Ring in 2004 Amid Heavy Security")); + markovChain.train(tokenize("Crowds Ring in New Year in Times Square")); + markovChain.train(tokenize("Revelers ring in 2004 amid heavy security")); + markovChain.train(tokenize("2004 arrives amid heavy security")); + markovChain.train(tokenize("Revelers Ring in 2004 Amid Heavy Security")); + markovChain.train(tokenize("2004 arrives amid heavy security")); + markovChain.train(tokenize("Revelers Ring in 2004 Amid Heavy Security")); + markovChain.train(tokenize("Revelers Greet 2004 Amid Heavy Security")); + markovChain.train(tokenize("Revelers Ring in 2004 Amid Heavy Security")); + markovChain.train(tokenize("Times Square Ball Starts 2004 Celebrations")); + markovChain.train(tokenize("Crowds Ring in New Year in Times Square")); + markovChain.train(tokenize("Under heavily armed guard, New York rings in 2004 with dropping of Times Square ..")); + markovChain.train(tokenize("Under heavily armed guard, New York rings in 2004 with dropping of Times Square ..")); + markovChain.train(tokenize("Crowds Ring in New Year in Times Square")); + markovChain.train(tokenize("High Security Marks U.S. New Year's Eve")); + markovChain.train(tokenize("U.S. to Mark New Year Under High Security")); + markovChain.train(tokenize("High security marks U.S. New Year's Eve")); + markovChain.train(tokenize("U.S. Marks New Year's With High Security")); + markovChain.train(tokenize("High Security Marks U.S. New Year's Eve")); + markovChain.train(tokenize("America Readies to Ring in the New Year")); + markovChain.train(tokenize("U.S. Marks New Year's With High Security")); + markovChain.train(tokenize("World Welcomes 2004 Amid Terror Fears")); + markovChain.train(tokenize("U.S. New Year's Bashes See High Security")); + markovChain.train(tokenize("High Security Marks U.S. New Year's Eve")); + markovChain.train(tokenize("U.S. Marks New Year's With High Security")); + markovChain.train(tokenize("High Security Marks U.S. New Year's Eve")); + markovChain.train(tokenize("America Counts Down to Start of New Year")); + markovChain.train(tokenize("High Security Marks U.S. 
New Year's Eve"));
+    markovChain.train(tokenize("Under heavily armed guard, America prepares for New Year's Eve celebrations"));
+    markovChain.train(tokenize("Security Tightens Ahead of New Year's Celebrations (Update3)"));
+    markovChain.train(tokenize("Tight Security Set for U.S. New Year's Eve Parties"));
+    markovChain.train(tokenize("New year begins under tight security"));
+    markovChain.train(tokenize("US snipers on New Year's look out"));
+    markovChain.train(tokenize("US snipers on New Year's look out"));
+    markovChain.train(tokenize("US snipers on New Year's look out"));
+    markovChain.train(tokenize("Across the nation, stringent security is part of scene"));
+    markovChain.train(tokenize("U.S. New Year's Bashes See High Security"));
+    markovChain.train(tokenize("Celebrations and Precautions"));
+    markovChain.train(tokenize("Security Tightens Ahead of New Year's Celebrations (Update2)"));
+    markovChain.train(tokenize("US snubs terror to celebrate"));
+    markovChain.train(tokenize("Heavy Security for Global New Year Celebrations"));
+    markovChain.train(tokenize("Heavy Security for Global New Year Celebrations"));
+
+    assertEquals(markovChain.extractProminentChainString(), "high security marks u.s. new year's eve");
+
+
+  }
+
+  private TokenStream tokenize(String input) {
+    TokenStream ts = new StandardTokenizer(new StringReader(input));
+    return new LowerCaseFilter(ts);
+  }
+
+
+}
Index: contrib/clusterer/src/java/org/apache/lucene/analysis/MarkovChainTokenFilter.java
===================================================================
--- contrib/clusterer/src/java/org/apache/lucene/analysis/MarkovChainTokenFilter.java (revision 0)
+++ contrib/clusterer/src/java/org/apache/lucene/analysis/MarkovChainTokenFilter.java (revision 0)
@@ -0,0 +1,109 @@
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.util.LinkedList;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * Creates a stream of merged-together tokens:
+ * n-grams on the word level rather than the character level.
+ *
+ * Quite useful for text mining purposes.
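+ *
+ * <p>A minimal usage sketch; the input and the expected output are taken
+ * from the accompanying TestMarkovChainTokenFilter:
+ *
+ * <pre>
+ * TokenStream ts = new WhitespaceTokenizer(new StringReader("this is a test"));
+ * ts = new MarkovChainTokenFilter(ts, 1, 3);
+ * // emits: "this", "this is", "this is a",
+ * //        "is", "is a", "is a test",
+ * //        "a", "a test",
+ * //        "test"
+ * Token token;
+ * while ((token = ts.next()) != null) {
+ *   System.out.println(new String(token.termBuffer(), 0, token.termLength()));
+ * }
+ * </pre>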
+ */
+public class MarkovChainTokenFilter extends TokenFilter {
+
+  private int minimumSize = 1;
+  private int maximumSize = 2;
+
+  public MarkovChainTokenFilter(TokenStream input) {
+    super(input);
+  }
+
+  public MarkovChainTokenFilter(TokenStream input, int minimumSize, int maximumSize) {
+    super(input);
+    this.minimumSize = minimumSize;
+    this.maximumSize = maximumSize;
+  }
+
+  private LinkedList/*<Token>*/ inputBuffer = new LinkedList/*<Token>*/();
+  private LinkedList/*<Token>*/ outputBuffer = new LinkedList/*<Token>*/();
+
+  public Token next(Token result) throws IOException {
+
+    if (inputBuffer == null) {
+      return null;
+    }
+
+    if (outputBuffer.size() == 0) {
+
+      while (inputBuffer.size() < maximumSize) {
+        Token token = input.next();
+        if (token != null) {
+          inputBuffer.addLast(token);
+        } else {
+          break;
+        }
+      }
+
+      if (inputBuffer.size() == 0 && outputBuffer.size() == 0) {
+        inputBuffer = null;
+        outputBuffer = null;
+        return null;
+      }
+
+      for (int chainSize = minimumSize; chainSize <= maximumSize && chainSize <= inputBuffer.size(); chainSize++) {
+        if (chainSize == 1) {
+          outputBuffer.add(inputBuffer.get(0));
+        } else {
+
+          Token chainToken = new Token();
+
+          int termLength = chainSize - 1; // whitespace between terms
+          for (int i = 0; i < chainSize; i++) {
+            termLength += ((Token) inputBuffer.get(i)).termLength();
+          }
+
+          char[] termText = new char[termLength];
+          int termTextOffset = 0;
+          for (int i = 0; i < chainSize; i++) {
+            Token token = (Token) inputBuffer.get(i);
+            System.arraycopy(token.termBuffer(), 0, termText, termTextOffset, token.termLength());
+            termTextOffset += token.termLength();
+            if (i == 0) {
+              chainToken.setStartOffset(token.startOffset());
+            }
+            if (i == chainSize - 1) {
+              chainToken.setEndOffset(token.endOffset());
+            } else {
+              termText[termTextOffset] = ' ';
+              termTextOffset++;
+            }
+          }
+          chainToken.setTermBuffer(termText, 0, termText.length);
+          chainToken.setType("Markov chain");
+          outputBuffer.add(chainToken);
+        }
+      }
+
+      inputBuffer.removeFirst();
+    }
+
+    return (Token) outputBuffer.removeFirst();
+
+  }
+
+}
Index: contrib/clusterer/src/java/org/apache/lucene/index/TokenStreamTermVectorMapper.java
===================================================================
--- contrib/clusterer/src/java/org/apache/lucene/index/TokenStreamTermVectorMapper.java (revision 0)
+++ contrib/clusterer/src/java/org/apache/lucene/index/TokenStreamTermVectorMapper.java (revision 0)
@@ -0,0 +1,80 @@
+package org.apache.lucene.index;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+import java.io.IOException;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * @author karl wettin
+ * Date: 2007-okt-11
+ * Time: 15:13:40
+ */
+public class TokenStreamTermVectorMapper extends TermVectorMapper {
+
+
+  public TokenStreamTermVectorMapper() {
+    this(false);
+  }
+
+  public TokenStreamTermVectorMapper(boolean ignoringOffset) {
+    super(false, ignoringOffset);
+  }
+
+  private Token[] tokens;
+
+  public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
+    this.tokens = new Token[numTerms];
+    if (!storePositions) {
+      throw new RuntimeException("Term positions required");
+    }
+  }
+
+  public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
+    for (int i = 0; i < positions.length; i++) {
+      Token token = new Token();
+      token.setTermBuffer(term.toCharArray(), 0, term.length());
+      tokens[positions[i]] = token;
+      if (offsets != null) {
+        token.setStartOffset(offsets[i].getStartOffset());
+        token.setEndOffset(offsets[i].getEndOffset());
+      }
+    }
+  }
+
+  public TokenStream tokenStreamFactory() {
+    return new TokenStream() {
+      int pos = 0;
+
+      public Token next() throws IOException {
+        return next(null);
+      }
+
+      public Token next(Token result) throws IOException {
+        if (pos == tokens.length) {
+          return null;
+        }
+        return tokens[pos++];
+      }
+
+      public void reset() throws IOException {
+        pos = 0;
+      }
+    };
+  }
+}
Index: contrib/clusterer/src/java/org/apache/lucene/index/TermVectorAccessor.java
===================================================================
--- contrib/clusterer/src/java/org/apache/lucene/index/TermVectorAccessor.java (revision 0)
+++ contrib/clusterer/src/java/org/apache/lucene/index/TermVectorAccessor.java (revision 0)
@@ -0,0 +1,156 @@
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * Transparent access to the vector space model, either via TermFreqVector or the inverted index.
+ *
+ * Warning! This class is not thread safe!
+ */
+public class TermVectorAccessor {
+
+  public TermVectorAccessor() {
+  }
+
+  /** Instance reused to save garbage collector some time */
+  private TermVectorMapperDecorator decoratedMapper = new TermVectorMapperDecorator();
+
+
+  /**
+   * Visits the TermVectorMapper and populates it with terms available for documentNumber,
+   * either via the second level cache or by resolving them from the inverted index.
+   *
+   * @param indexReader
+   * @param documentNumber
+   * @param field
+   * @param mapper
+   * @throws IOException
+   */
+  public void accept(IndexReader indexReader, int documentNumber, String field, TermVectorMapper mapper) throws IOException {
+
+    field = field.intern();
+
+    decoratedMapper.decorated = mapper;
+    decoratedMapper.level2Cached = false;
+
+    try {
+      indexReader.getTermFreqVector(documentNumber, field, decoratedMapper);
+    } catch (IOException ioe) {
+      // ignore exception stating there is no term vector stored.
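+      // (level2Cached remains false in that case, so the terms are rebuilt
+      // from the inverted index by build() below)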
+      // todo: consider implementing NoTermFreqVectorAvailableException
+    }
+
+    if (!decoratedMapper.level2Cached) {
+      build(indexReader, field, mapper, documentNumber);
+    }
+  }
+
+  /** Instance reused to save garbage collector some time */
+  private List/*<String>*/ tokens = new ArrayList/*<String>*/(500);
+
+  /** Instance reused to save garbage collector some time */
+  private List/*<int[]>*/ positions = new ArrayList/*<int[]>*/(500);
+
+  /** Instance reused to save garbage collector some time */
+  private List/*<Integer>*/ frequencies = new ArrayList/*<Integer>*/(500);
+
+
+  /**
+   * Visits the TermVectorMapper and populates it with terms available for documentNumber
+   * by resolving the inverted index.
+   *
+   * @param indexReader
+   * @param field
+   * @param mapper
+   * @param documentNumber
+   * @throws IOException
+   */
+  private void build(IndexReader indexReader, String field, TermVectorMapper mapper, int documentNumber) throws IOException {
+
+    tokens.clear();
+    frequencies.clear();
+    positions.clear();
+
+    TermEnum termEnum = indexReader.terms();
+    if (termEnum.skipTo(new Term(field, ""))) {
+
+      while (termEnum.term().field() == field) {
+        TermPositions termPositions = indexReader.termPositions(termEnum.term());
+        if (termPositions.skipTo(documentNumber)) {
+
+          frequencies.add(termPositions.freq());
+          tokens.add(termEnum.term().text());
+
+
+          if (!mapper.isIgnoringPositions()) {
+            int[] positions = new int[termPositions.freq()];
+            for (int i = 0; i < positions.length; i++) {
+              positions[i] = termPositions.nextPosition();
+            }
+            this.positions.add(positions);
+          } else {
+            positions.add(null);
+          }
+        }
+        termPositions.close();
+        if (!termEnum.next()) {
+          break;
+        }
+      }
+
+      mapper.setExpectations(field, tokens.size(), false, !mapper.isIgnoringPositions());
+      for (int i = 0; i < tokens.size(); i++) {
+        mapper.map((String) tokens.get(i), (Integer) frequencies.get(i), null, (int[]) positions.get(i));
+      }
+
+    }
+    termEnum.close();
+
+
+  }
+
+
+  private static class TermVectorMapperDecorator extends TermVectorMapper {
+
+    private TermVectorMapper decorated;
+
+    public boolean isIgnoringPositions() {
+      return decorated.isIgnoringPositions();
+    }
+
+    public boolean isIgnoringOffsets() {
+      return decorated.isIgnoringOffsets();
+    }
+
+    private boolean level2Cached = false;
+
+    public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
+      decorated.setExpectations(field, numTerms, storeOffsets, storePositions);
+      level2Cached = true;
+    }
+
+    public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
+      decorated.map(term, frequency, offsets, positions);
+      level2Cached = true; // todo: this might be unnecessary
+    }
+
+  }
+
+}
Index: contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/Tree.java
===================================================================
--- contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/Tree.java (revision 0)
+++ contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/Tree.java (revision 0)
@@ -0,0 +1,338 @@
+package org.apache.lucene.clusterer.tree;
+
+import org.apache.lucene.clusterer.AppendingVectorSpaceFrequencyMapper;
+import org.apache.lucene.clusterer.HacGqfTermReducer;
+import org.apache.lucene.clusterer.TanimotoSimilarity;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermVectorAccessor;
+import org.apache.lucene.search.Similarity;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.text.DecimalFormat;
+import java.util.*;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * Two-dimensional decision tree
+ */
+public class Tree {
+
+  private int maxTermsPerFieldThreshold = 1000;
+  private int maxTermsPerFieldPruned = 1000;
+  private final String[] fields;
+
+  private TanimotoSimilarity similarity = new TanimotoSimilarity();
+
+
+  public Tree(String[] fields) {
+    this.fields = fields;
+    root = new Root();
+  }
+
+  private Root root;
+  private Set<Leaf> leafs = new HashSet<Leaf>(1000);
+  private Set<Instance> instances = new HashSet<Instance>(1000);
+
+
+  public int findDocumentNumberByDocumentIdentity(IndexReader indexReader, Term documentIdentity) throws IOException {
+    TermDocs termDocs = indexReader.termDocs(documentIdentity);
+    if (!termDocs.next()) {
+      throw new IOException("No document found for " + documentIdentity);
+    }
+    int ret = termDocs.doc();
+    termDocs.close();
+    return ret;
+  }
+
+
+  public Map<String, Map<String, Double>> calculateMeanVectorSpace(Map<String, Map<String, Double>>... vectorSpaces) {
+    return calculateMeanVectorSpace(maxTermsPerFieldThreshold, maxTermsPerFieldPruned, vectorSpaces);
+  }
+
+  public Map<String, Map<String, Double>> calculateMeanVectorSpace(int maxTermsPerFieldThreshold, int maxTermsPerFieldPruned, Map<String, Map<String, Double>>... vectorSpaces) {
+
+    // add
+    Map<String, Map<String, Double>> meanVectorSpace = new HashMap<String, Map<String, Double>>();
+    for (Map<String, Map<String, Double>> vectorSpace : vectorSpaces) {
+      for (Map.Entry<String, Map<String, Double>> field : vectorSpace.entrySet()) {
+        Map<String, Double> meanTF = meanVectorSpace.get(field.getKey());
+        if (meanTF == null) {
+          meanTF = new HashMap<String, Double>(field.getValue().size() * vectorSpaces.length);
+          meanVectorSpace.put(field.getKey(), meanTF);
+        }
+        for (Map.Entry<String, Double> tf : field.getValue().entrySet()) {
+          Double frequency = meanTF.get(tf.getKey());
+          if (frequency == null) {
+            frequency = 0d;
+          }
+          meanTF.put(tf.getKey(), frequency + tf.getValue());
+        }
+      }
+    }
+
+    // prune
+    for (Map.Entry<String, Map<String, Double>> field : meanVectorSpace.entrySet()) {
+      if (field.getValue().size() > maxTermsPerFieldThreshold) {
+        List<Map.Entry<String, Double>> ordered = new ArrayList<Map.Entry<String, Double>>(field.getValue().entrySet());
+        Collections.sort(ordered, new Comparator<Map.Entry<String, Double>>() {
+          public int compare(Map.Entry<String, Double> entry, Map.Entry<String, Double> entry1) {
+            return entry.getValue().compareTo(entry1.getValue());
+          }
+        });
+        for (int i = 0; i < ordered.size() - maxTermsPerFieldPruned; i++) {
+          field.getValue().remove(ordered.get(i).getKey());
+        }
+      }
+    }
+
+    // divide
+    for (Map.Entry<String, Map<String, Double>> field : meanVectorSpace.entrySet()) {
+      for (Map.Entry<String, Double> tf : field.getValue().entrySet()) {
+        tf.setValue(tf.getValue() / vectorSpaces.length);
+      }
+    }
+
+
+    return meanVectorSpace;
+  }
+
+  private HacGqfTermReducer termReducer = new HacGqfTermReducer();
+
+  public Instance insertFromRoot(IndexReader indexReader, Term instanceIdentity) throws IOException {
+    return insertFromRoot(indexReader, instanceIdentity, null, null);
+  }
+
+  public Instance insertFromRoot(IndexReader indexReader, Term instanceIdentity, Writer insertWriter, Writer rebuildWriter) throws IOException {
+
+    TermVectorAccessor termVectorAccessor = new TermVectorAccessor();
+    AppendingVectorSpaceFrequencyMapper mapper = new AppendingVectorSpaceFrequencyMapper();
+
+    int documentNumber = findDocumentNumberByDocumentIdentity(indexReader, instanceIdentity);
+    for (String field : fields) {
+      termVectorAccessor.accept(indexReader, documentNumber, field, mapper);
+      if (mapper.getTermFrequencyVectors().containsKey(field)) {
+        double norm = Similarity.decodeNorm(indexReader.norms(field)[documentNumber]);
+        for (Iterator<Map.Entry<String, Double>> it = mapper.getTermFrequencyVectors().get(field).entrySet().iterator(); it.hasNext();)
+        {
+          Map.Entry<String, Double> tf = it.next();
+          if (!termReducer.accept(indexReader, new Term(field, tf.getKey()))) {
+            it.remove();
+          } else {
+            tf.setValue(tf.getValue() * norm);
+          }
+        }
+      }
+    }
+
+    Map<String, Map<String, Double>> instanceVectorSpace = mapper.getTermFrequencyVectors();
+
+    Instance instance = new Instance(instanceIdentity, instanceVectorSpace);
+
+    insertFromRoot(instance, insertWriter, rebuildWriter);
+
+
+    return instance;
+  }
+
+
+  public void insertFromRoot(Instance instance) throws IOException {
+    insertFromRoot(instance, null, null);
+  }
+
+  public void insertFromRoot(Instance instance, Writer insertWriter, Writer rebuildWriter) throws IOException {
+
+
+    long ms = System.currentTimeMillis();
+
+
+    if (leafs.size() == 0) {
+
+      Leaf leaf = new Leaf(instance);
+      instance.setLeaf(leaf);
+      leafs.add(leaf);
+
+      root.getChildren()[0] = leaf;
+      leaf.setParent(root);
+
+    } else if (leafs.size() == 1) {
+
+      Leaf leaf = new Leaf(instance);
+      instance.setLeaf(leaf);
+      leafs.add(leaf);
+
+      root.getChildren()[1] = leaf;
+      leaf.setParent(root);
+
+      root.setVectorSpace(calculateMeanVectorSpace(root.getChildren()[0].getVectorSpace(), root.getChildren()[1].getVectorSpace()));
+      double distance = similarity.distance(root.getChildren()[0].getVectorSpace(), root.getChildren()[1].getVectorSpace());
+      root.setDistanceBetweenChildren(distance);
+
+    } else {
+
+      Leaf closestLeaf;
+
+      BranchedNode branchedNode = root;
+      double distanceToClosestChild;
+
+
+      while (true) {
+
+        int closestChild = -1;
+        distanceToClosestChild = Double.MAX_VALUE;
+        for (int i = 0; i < branchedNode.getChildren().length; i++) {
+          double distanceToCurrentChild = similarity.distance(instance.getVectorSpace(), branchedNode.getChildren()[i].getVectorSpace());
+          if (distanceToCurrentChild <= distanceToClosestChild) {
+            distanceToClosestChild = distanceToCurrentChild;
+            closestChild = i;
+          }
+        }
+
+        if (branchedNode.getChildren()[closestChild] instanceof Leaf) {
+          closestLeaf = (Leaf) branchedNode.getChildren()[closestChild];
+          break;
+        } else {
+          branchedNode = (BranchedNode) branchedNode.getChildren()[closestChild];
+        }
+
+      }
+
+      // the starting point for recursive mean vector space recalculation, i.e. from leaf parent to root.
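+      // tmp is set below to either the closest leaf's parent (when the instance is
+      // merged into that leaf) or to the newly created branch (when the instance
+      // ends up in a new sibling leaf).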
+      BranchedNode tmp;
+
+      if (distanceToClosestChild <= 0.2d) {
+        // add instance to closest leaf
+        closestLeaf.getInstances().add(instance);
+        instance.setLeaf(closestLeaf);
+
+        Map<String, Map<String, Double>>[] instanceVectorSpaces = new Map[closestLeaf.getInstances().size()];
+
+        Iterator<Instance> instances = closestLeaf.getInstances().iterator();
+        for (int i = 0; i < closestLeaf.getInstances().size(); i++) {
+          instanceVectorSpaces[i] = instances.next().getVectorSpace();
+        }
+        closestLeaf.setVectorSpace(calculateMeanVectorSpace(instanceVectorSpaces));
+
+        tmp = closestLeaf.getParent();
+
+      } else {
+        // create new branch and leaf
+
+        Branch branch = new Branch(closestLeaf.getParent());
+        int indexOfClosestLeafInItsParent = closestLeaf.getParent().indexOfChild(closestLeaf);
+        closestLeaf.getParent().getChildren()[indexOfClosestLeafInItsParent] = branch;
+
+        closestLeaf.setParent(branch);
+        branch.getChildren()[0] = closestLeaf;
+
+        Leaf leaf = new Leaf(instance);
+        instance.setLeaf(leaf);
+        leafs.add(leaf);
+
+        leaf.setParent(branch);
+        branch.getChildren()[1] = leaf;
+
+        tmp = branch;
+
+      }
+
+      if (insertWriter != null) {
+        insertWriter.write(String.valueOf(instances.size() + 1));
+        insertWriter.write("\t");
+        insertWriter.write(String.valueOf(System.currentTimeMillis() - ms));
+        insertWriter.write("\n");
+      }
+
+      ms = System.currentTimeMillis();
+      // re-calculate mean vectorspaces and distance between branch children all the way up to root
+      while (true) {
+
+        tmp.setVectorSpace(calculateMeanVectorSpace(tmp.getChildren()[0].getVectorSpace(), tmp.getChildren()[1].getVectorSpace()));
+        double distance = similarity.distance(tmp.getChildren()[0].getVectorSpace(), tmp.getChildren()[1].getVectorSpace());
+
+        tmp.setDistanceBetweenChildren(distance);
+        if (tmp instanceof ChildNode) {
+          tmp = ((ChildNode) tmp).getParent();
+        } else {
+          break;
+        }
+      }
+
+
+      if (rebuildWriter != null) {
+        rebuildWriter.write(String.valueOf(instances.size() + 1));
+        rebuildWriter.write("\t");
+        rebuildWriter.write(String.valueOf(System.currentTimeMillis() - ms));
+        rebuildWriter.write("\n");
+      }
+
+    }
+
+    instances.add(instance);
+  }
+
+
+  public Root getRoot() {
+    return root;
+  }
+
+  public Set<Leaf> getLeafs() {
+    return leafs;
+  }
+
+
+  public String[] getFields() {
+    return fields;
+  }
+
+  public Set<Instance> getInstances() {
+    return instances;
+  }
+
+  public HacGqfTermReducer getTermReducer() {
+    return termReducer;
+  }
+
+  private static final DecimalFormat df = new DecimalFormat("#.####");
+
+  public void toDOT(Node node, Writer writer) throws IOException {
+    // todo visitor
+
+    if (node instanceof BranchedNode) {
+      BranchedNode branchedNode = (BranchedNode) node;
+      writer.write("\"" + branchedNode.toString() + "\" [label=\"");
+      writer.write(df.format(branchedNode.getDistanceBetweenChildren()));
+      writer.write("\"];\n");
+      writer.write("\"" + branchedNode.toString() + "\" -> \"" + branchedNode.getChildren()[0].toString() + "\";\n");
+      writer.write("\"" + branchedNode.toString() + "\" -> \"" + branchedNode.getChildren()[1].toString() + "\";\n");
+      toDOT(branchedNode.getChildren()[0], writer);
+      toDOT(branchedNode.getChildren()[1], writer);
+    } else if (node instanceof Leaf) {
+      Leaf leaf = (Leaf) node;
+
+      writer.write("\"" + leaf.toString() + "\" [label=\"");
+      writer.write(String.valueOf(leaf.getInstances().size()));
+      writer.write("\"];\n");
+
+    }
+
+  }
+
+
+}
Index: contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/TreeCluster.java
===================================================================
--- contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/TreeCluster.java (revision 0)
+++ contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/TreeCluster.java (revision 0)
@@ -0,0 +1,60 @@
+package org.apache.lucene.clusterer.tree;
+
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Map;
+import java.util.HashMap;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * @author karl wettin
+ * Date: 2007-okt-03
+ * Time: 23:55:23
+ */
+public class TreeCluster {
+
+  private List<Instance> instances = new ArrayList<Instance>();
+  private List<Node> nodes = new ArrayList<Node>();
+
+  private Map<Node, Double> boundaries = new HashMap<Node, Double>();
+
+
+  public List<Instance> getInstances() {
+    return instances;
+  }
+
+  public void setInstances(List<Instance> instances) {
+    this.instances = instances;
+  }
+
+
+  public Map<Node, Double> getBoundaries() {
+    return boundaries;
+  }
+
+  public void setBoundaries(Map<Node, Double> boundaries) {
+    this.boundaries = boundaries;
+  }
+
+  public List<Node> getNodes() {
+    return nodes;
+  }
+
+  public void setNodes(List<Node> nodes) {
+    this.nodes = nodes;
+  }
+}
Index: contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/Root.java
===================================================================
--- contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/Root.java (revision 0)
+++ contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/Root.java (revision 0)
@@ -0,0 +1,64 @@
+package org.apache.lucene.clusterer.tree;
+
+import java.util.Map;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * The root of the tree. May contain children but no parent.
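+ * Its two children are {@link Leaf} or {@link Branch} nodes.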
+ */
+public class Root implements BranchedNode {
+
+  private final ChildNode[] children = new ChildNode[2];
+  private Double distanceBetweenChildren;
+  private Map<String, Map<String, Double>> vectorSpace;
+
+
+  public Map<String, Map<String, Double>> getVectorSpace() {
+    return vectorSpace;
+  }
+
+  public void setVectorSpace(Map<String, Map<String, Double>> vectorSpace) {
+    this.vectorSpace = vectorSpace;
+  }
+
+  public Double getDistanceBetweenChildren() {
+    return distanceBetweenChildren;
+  }
+
+  public void setDistanceBetweenChildren(Double distanceBetweenChildren) {
+    this.distanceBetweenChildren = distanceBetweenChildren;
+  }
+
+  public int indexOfChild(ChildNode childNode) {
+    for (int i = 0; i < 2; i++) {
+      if (childNode.equals(children[i])) {
+        return i;
+      }
+    }
+    return -1;
+  }
+
+  public ChildNode[] getChildren() {
+    return children;
+  }
+
+
+  public void accept(NodeVisitor visitor) {
+    visitor.visit(this);
+  }
+}
Index: contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/Instance.java
===================================================================
--- contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/Instance.java (revision 0)
+++ contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/Instance.java (revision 0)
@@ -0,0 +1,85 @@
+package org.apache.lucene.clusterer.tree;
+
+import org.apache.lucene.index.Term;
+
+import java.util.Map;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * @author karl wettin
+ * Date: 2007-okt-07
+ * Time: 18:02:20
+ */
+public class Instance implements Node {
+
+  private Map<String, Map<String, Double>> vectorSpace;
+  private String title;
+  private Term identity;
+  private Leaf leaf;
+
+
+  public Instance(Term identity, Map<String, Map<String, Double>> vectorSpace) {
+    this.identity = identity;
+    this.vectorSpace = vectorSpace;
+  }
+
+  public Map<String, Map<String, Double>> getVectorSpace() {
+    return vectorSpace;
+  }
+
+  public void setVectorSpace(Map<String, Map<String, Double>> vectorSpace) {
+    this.vectorSpace = vectorSpace;
+  }
+
+  public String toString() {
+    if (getTitle() != null) {
+      return getTitle() + " " + identity.text();
+    } else {
+      return identity.text();
+    }
+  }
+
+
+  public Term getIdentity() {
+    return identity;
+  }
+
+  public void setIdentity(Term identity) {
+    this.identity = identity;
+  }
+
+  public String getTitle() {
+    return title;
+  }
+
+  public void setTitle(String title) {
+    this.title = title;
+  }
+
+
+  public Leaf getLeaf() {
+    return leaf;
+  }
+
+  public void setLeaf(Leaf leaf) {
+    this.leaf = leaf;
+  }
+
+  public void accept(NodeVisitor visitor) {
+    visitor.visit(this);
+  }
+}
Index: contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/Node.java
===================================================================
--- contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/Node.java (revision 0)
+++ contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/Node.java (revision 0)
@@ -0,0 +1,32 @@
+package org.apache.lucene.clusterer.tree;
+
+import java.util.Map;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
Index: contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/NodeVisitor.java
===================================================================
--- contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/NodeVisitor.java (revision 0)
+++ contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/NodeVisitor.java (revision 0)
@@ -0,0 +1,30 @@
+package org.apache.lucene.clusterer.tree;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * @author karl wettin
+ * Date: 2007-Oct-04
+ * Time: 00:20:31
+ */
+public interface NodeVisitor {
+
+  public abstract void visit(Instance instance);
+  public abstract void visit(Leaf leaf);
+  public abstract void visit(Branch branch);
+  public abstract void visit(Root root);
+
+}
Index: contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/Leaf.java
===================================================================
--- contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/Leaf.java (revision 0)
+++ contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/Leaf.java (revision 0)
@@ -0,0 +1,90 @@
+package org.apache.lucene.clusterer.tree;
+
+import org.apache.lucene.index.Term;
+
+import java.util.Map;
+import java.util.Set;
+import java.util.HashSet;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * A node in the tree with a parent.
+ * May contain any number of instances, but at least one.
+ */
+public class Leaf implements ChildNode {
+
+
+  private BranchedNode parent;
+  private Map<String, Map<String, Double>> vectorSpace;
+
+  private Set<Instance> instances;
+
+  public Leaf(Instance instance) {
+    instances = new HashSet<Instance>(1);
+    instances.add(instance);
+    vectorSpace = instance.getVectorSpace();
+  }
+
+  /**
+   * If only one instance is available, this vector space is the same
+   * object as the vector space of that instance, so don't go modifying
+   * this value unless you know what you are doing.
+   *
+   * @return Mean vector space of all instances
+   */
+  public Map<String, Map<String, Double>> getVectorSpace() {
+    return vectorSpace;
+  }
+
+  public void setVectorSpace(Map<String, Map<String, Double>> vectorSpace) {
+    this.vectorSpace = vectorSpace;
+  }
+
+  public ChildNode getSibling() {
+    if (getParent() == null) {
+      return null;
+    } else if (getParent().getChildren()[0] == this) {
+      return getParent().getChildren()[1];
+    } else if (getParent().getChildren()[1] == this) {
+      return getParent().getChildren()[0];
+    } else {
+      return null;
+    }
+  }
+
+  public BranchedNode getParent() {
+    return parent;
+  }
+
+  public void setParent(BranchedNode parent) {
+    this.parent = parent;
+  }
+
+
+  public void accept(NodeVisitor visitor) {
+    visitor.visit(this);
+  }
+
+
+  public Set<Instance> getInstances() {
+    return instances;
+  }
+
+  public void setInstances(Set<Instance> instances) {
+    this.instances = instances;
+  }
+}
Index: contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/TreeClusterer.java
===================================================================
--- contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/TreeClusterer.java (revision 0)
+++ contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/TreeClusterer.java (revision 0)
@@ -0,0 +1,203 @@
+package org.apache.lucene.clusterer.tree;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * @author karl wettin
+ * Date: 2007-Oct-03
+ * Time: 23:48:53
+ */
+public class TreeClusterer {
+
+  private Tree tree;
+  private double clusterBoundary;
+
+  public TreeClusterer(Tree tree, double clusterBoundary) {
+    this.tree = tree;
+    this.clusterBoundary = clusterBoundary;
+    clusters = new HashSet<TreeCluster>();
+    clusterByInstance = new HashMap<Instance, TreeCluster>(tree.getInstances().size());
+  }
+
+  private Map<Node, TreeCluster> clusterByNode = new HashMap<Node, TreeCluster>();
+
+  private Map<Instance, TreeCluster> clusterByInstance;
+  private Set<TreeCluster> clusters;
+
+  public TreeCluster getCluster(Instance instance) {
+
+    TreeCluster cluster = clusterByInstance.get(instance);
+    if (cluster == null) {
+      TreeClustererNodeVisitor nodeVisitor = new TreeClustererNodeVisitor();
+      instance.accept(nodeVisitor);
+      cluster = nodeVisitor.getCluster();
+
+      for (Instance member : cluster.getInstances()) {
+        clusterByInstance.put(member, cluster);
+      }
+
+      for (Node node : cluster.getNodes()) {
+        clusterByNode.put(node, cluster);
+      }
+
+      clusters.add(cluster);
+    }
+
+    return cluster;
+  }
+
+  public void resolveAllClusters() {
+    for (Instance instance : tree.getInstances()) {
+      if (!clusterByInstance.containsKey(instance)) {
+        getCluster(instance);
+      }
+    }
+  }
+
+
+  public Map<Node, TreeCluster> getClusterByNode() {
+    return clusterByNode;
+  }
+
+  public Tree getTree() {
+    return tree;
+  }
+
+  public double getClusterBoundary() {
+    return clusterBoundary;
+  }
+
+
+  public Map<Instance, TreeCluster> getClusterByInstance() {
+    return clusterByInstance;
+  }
+
+  public Set<TreeCluster> getClusters() {
+    return clusters;
+  }
+
+  private class TreeClustererNodeVisitor implements NodeVisitor {
+    private Set<Node> visitedNodes = new HashSet<Node>();
+    private TreeCluster cluster = new TreeCluster();
+
+
+    public void visit(Instance instance) {
+      if (clusterByInstance.containsKey(instance)) {
+        throw new RuntimeException("This should never occur!");
+      }
+
+      visitedNodes.add(instance);
+      cluster.getInstances().add(instance);
+
+      if (!visitedNodes.contains(instance.getLeaf())) {
+        instance.getLeaf().accept(this);
+      }
+    }
+
+    public void visit(Leaf leaf) {
+
+      visitedNodes.add(leaf);
+      cluster.getNodes().add(leaf);
+
+
+      for (Instance instance : leaf.getInstances()) {
+        if (!visitedNodes.contains(instance)) {
+          instance.accept(this);
+        }
+      }
+
+      Node sibling = leaf.getSibling();
+      if (!visitedNodes.contains(sibling)) {
+        if (clusterBoundary >= leaf.getParent().getDistanceBetweenChildren()) {
+          sibling.accept(this);
+        } else {
+          visitedNodes.add(sibling);
+          cluster.getBoundaries().put(sibling, leaf.getParent().getDistanceBetweenChildren());
+        }
+      }
+
+      if (!visitedNodes.contains(leaf.getParent())) {
+        visitedNodes.add(leaf.getParent());
+        if (clusterBoundary >= leaf.getParent().getDistanceBetweenChildren()) {
+          leaf.getParent().accept(this);
+        } else {
+          cluster.getBoundaries().put(leaf.getParent(), leaf.getParent().getDistanceBetweenChildren());
+        }
+      }
+
+    }
+
+    public void visit(Branch branch) {
+
+      visitedNodes.add(branch);
+      cluster.getNodes().add(branch);
+
+
+      for (int i = 0; i < 2; i++) {
+        if (!visitedNodes.contains(branch.getChildren()[i])) {
+          if (clusterBoundary >= branch.getDistanceBetweenChildren()) {
+            branch.getChildren()[i].accept(this);
+          } else {
+            visitedNodes.add(branch.getChildren()[i]);
+            cluster.getBoundaries().put(branch.getChildren()[i], branch.getDistanceBetweenChildren());
+          }
+        }
+      }
+
+
+      if (!visitedNodes.contains(branch.getParent())) {
+        visitedNodes.add(branch.getParent());
+        if (clusterBoundary >= branch.getParent().getDistanceBetweenChildren()) {
+          branch.getParent().accept(this);
+        } else {
+          cluster.getBoundaries().put(branch.getParent(), branch.getParent().getDistanceBetweenChildren());
+        }
+      }
+
+    }
+
+    public void visit(Root root) {
+
+      visitedNodes.add(root);
+
+      for (int i = 0; i < 2; i++) {
+        if (!visitedNodes.contains(root.getChildren()[i])) {
+          if (clusterBoundary >= root.getDistanceBetweenChildren()) {
+            root.getChildren()[i].accept(this);
+          } else {
+            visitedNodes.add(root.getChildren()[i]);
+            cluster.getBoundaries().put(root.getChildren()[i], root.getDistanceBetweenChildren());
+          }
+        }
+      }
+    }
+
+
+    public Set<Node> getVisitedNodes() {
+      return visitedNodes;
+    }
+
+    public TreeCluster getCluster() {
+      return cluster;
+    }
+  }
+}
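For illustration (not part of the patch), a minimal sketch of how the clusterer above is meant to be driven. It assumes a fully assembled Tree (the Tree class and the hierarchy builder are defined elsewhere in this patch) and an arbitrary cluster boundary of 0.5; all variable names are illustrative:

    // resolve every instance into a cluster, then print the result
    TreeClusterer clusterer = new TreeClusterer(tree, 0.5d);
    clusterer.resolveAllClusters();
    for (TreeCluster cluster : clusterer.getClusters()) {
      System.out.println("cluster of " + cluster.getInstances().size() + " instances:");
      for (Instance instance : cluster.getInstances()) {
        System.out.println("  " + instance); // Instance#toString prints title + identity
      }
    }

Instances end up in the same TreeCluster when the path between them in the tree never crosses an edge whose distanceBetweenChildren exceeds the boundary.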
Index: contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/ChildNode.java
===================================================================
--- contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/ChildNode.java (revision 0)
+++ contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/ChildNode.java (revision 0)
@@ -0,0 +1,27 @@
+package org.apache.lucene.clusterer.tree;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * A node in the tree that has a parent.
+ */
+public interface ChildNode extends Node {
+
+  public abstract BranchedNode getParent();
+
+  public abstract ChildNode getSibling();
+
+}
Index: contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/BranchedNode.java
===================================================================
--- contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/BranchedNode.java (revision 0)
+++ contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/BranchedNode.java (revision 0)
@@ -0,0 +1,33 @@
+package org.apache.lucene.clusterer.tree;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * @author karl wettin
+ * Date: 2007-Oct-03
+ * Time: 04:26:23
+ */
+public interface BranchedNode extends Node {
+
+  public abstract ChildNode[] getChildren();
+
+  public int indexOfChild(ChildNode childNode);
+
+  public Double getDistanceBetweenChildren();
+  public void setDistanceBetweenChildren(Double distance);
+
+}
Index: contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/Branch.java
===================================================================
--- contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/Branch.java (revision 0)
+++ contrib/clusterer/src/java/org/apache/lucene/clusterer/tree/Branch.java (revision 0)
@@ -0,0 +1,92 @@
+package org.apache.lucene.clusterer.tree;
+
+import java.util.Map;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * A node in the tree with a parent and children.
+ */
+public class Branch implements ChildNode, BranchedNode {
+
+  private BranchedNode parent;
+  private final ChildNode[] children = new ChildNode[2];
+
+  private Map<String, Map<String, Double>> vectorSpace;
+
+
+  public Map<String, Map<String, Double>> getVectorSpace() {
+    return vectorSpace;
+  }
+
+  public void setVectorSpace(Map<String, Map<String, Double>> vectorSpace) {
+    this.vectorSpace = vectorSpace;
+  }
+
+  private Double distanceBetweenChildren;
+
+  public Branch(BranchedNode parent) {
+    this.parent = parent;
+  }
+
+
+  public Double getDistanceBetweenChildren() {
+    return distanceBetweenChildren;
+  }
+
+  public void setDistanceBetweenChildren(Double distanceBetweenChildren) {
+    this.distanceBetweenChildren = distanceBetweenChildren;
+  }
+
+  public int indexOfChild(ChildNode childNode) {
+    for (int i = 0; i < 2; i++) {
+      if (childNode.equals(children[i])) {
+        return i;
+      }
+    }
+    return -1;
+  }
+
+  public ChildNode getSibling() {
+    if (getParent() == null) {
+      return null;
+    } else if (getParent().getChildren()[0] == this) {
+      return getParent().getChildren()[1];
+    } else if (getParent().getChildren()[1] == this) {
+      return getParent().getChildren()[0];
+    } else {
+      return null;
+    }
+  }
+
+  public ChildNode[] getChildren() {
+    return children;
+  }
+
+  public BranchedNode getParent() {
+    return parent;
+  }
+
+  public void setParent(BranchedNode parent) {
+    this.parent = parent;
+  }
+
+
+  public void accept(NodeVisitor visitor) {
+    visitor.visit(this);
+  }
+}
Index: contrib/clusterer/src/java/org/apache/lucene/clusterer/TanimotoSimilarity.java
===================================================================
--- contrib/clusterer/src/java/org/apache/lucene/clusterer/TanimotoSimilarity.java (revision 0)
+++ contrib/clusterer/src/java/org/apache/lucene/clusterer/TanimotoSimilarity.java (revision 0)
@@ -0,0 +1,114 @@
+package org.apache.lucene.clusterer;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * Tanimoto coefficient implementation.
+ *
+ * http://en.wikipedia.org/wiki/Jaccard_index
+ *
+ * Warning, this class is not thread safe!
+ *
+ */
+public class TanimotoSimilarity {
+
+
+  private double ab;
+  private double a2;
+  private double b2;
+
+  private Map<String, Map<String, Double>> acopy = new HashMap<String, Map<String, Double>>();
+  private Map<String, Map<String, Double>> bcopy = new HashMap<String, Map<String, Double>>();
+
+
+  /**
+   * Calculates the distance between two vector spaces.
+   *
+   * @param vectorSpace0 field name mapped to term frequencies
+   * @param vectorSpace1 field name mapped to term frequencies
+   * @return 0 for perfect match, > 0 for greater distance
+   * @throws IOException
+   */
+  public double distance(Map<String, Map<String, Double>> vectorSpace0, Map<String, Map<String, Double>> vectorSpace1) throws IOException {
+    ab = 0d;
+    a2 = 0d;
+    b2 = 0d;
+
+    deepCopy(vectorSpace0, acopy);
+    deepCopy(vectorSpace1, bcopy);
+
+    calculate(acopy, bcopy);
+    calculate(bcopy, acopy);
+
+    return ((a2 + b2 - ab) / ab) - 1;
+  }
+
+
+  /**
+   * @param vectorSpace0
+   * @param vectorSpace1
+   */
+  private void calculate(Map<String, Map<String, Double>> vectorSpace0, Map<String, Map<String, Double>> vectorSpace1) {
+    for (Iterator<Map.Entry<String, Map<String, Double>>> fieldEntries = vectorSpace0.entrySet().iterator(); fieldEntries.hasNext();) {
+      Map.Entry<String, Map<String, Double>> fieldEntry = fieldEntries.next();
+      Map<String, Double> termFrequencies = vectorSpace1.get(fieldEntry.getKey());
+
+      for (Iterator<Map.Entry<String, Double>> termFrequenciesEntries = fieldEntry.getValue().entrySet().iterator(); termFrequenciesEntries.hasNext();) {
+        Map.Entry<String, Double> termFrequencyEntry = termFrequenciesEntries.next();
+        Double termFrequency;
+        if (termFrequencies != null) {
+          termFrequency = termFrequencies.remove(termFrequencyEntry.getKey());
+          if (termFrequency == null) {
+            termFrequency = 0d;
+          }
+        } else {
+          termFrequency = 0d;
+        }
+
+        ab += termFrequencyEntry.getValue() * termFrequency;
+        a2 += termFrequencyEntry.getValue() * termFrequencyEntry.getValue();
+        b2 += termFrequency * termFrequency;
+
+        termFrequenciesEntries.remove();
+      }
+
+      fieldEntries.remove();
+
+    }
+  }
+
+
+  private void deepCopy(Map<String, Map<String, Double>> source, Map<String, Map<String, Double>> destination) {
+    destination.clear();
+    for (Map.Entry<String, Map<String, Double>> e : source.entrySet()) {
+      Map<String, Double> tf = new HashMap<String, Double>(e.getValue().size());
+      for (Map.Entry<String, Double> tfe : e.getValue().entrySet()) {
+        tf.put(tfe.getKey(), tfe.getValue());
+      }
+      destination.put(e.getKey(), tf);
+    }
+  }
+
+
+}
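A worked example of the distance above (not part of the patch). With a single "title" field, the dot product is ab = 2*2 = 4 and the sums of squares are a2 + b2 = 5 + 5 = 10, so the distance is ((10 - 4) / 4) - 1 = 0.5; two identical vector spaces yield 0:

    import java.util.HashMap;
    import java.util.Map;

    import org.apache.lucene.clusterer.TanimotoSimilarity;

    public class TanimotoExample {
      public static void main(String[] args) throws Exception {
        Map<String, Double> tf0 = new HashMap<String, Double>();
        tf0.put("lucene", 2d);
        tf0.put("cluster", 1d);
        Map<String, Map<String, Double>> vs0 = new HashMap<String, Map<String, Double>>();
        vs0.put("title", tf0);

        Map<String, Double> tf1 = new HashMap<String, Double>();
        tf1.put("lucene", 2d);
        tf1.put("index", 1d);
        Map<String, Map<String, Double>> vs1 = new HashMap<String, Map<String, Double>>();
        vs1.put("title", tf1);

        // the class copies its input internally, so vs0 and vs1 are left untouched
        System.out.println(new TanimotoSimilarity().distance(vs0, vs1)); // prints 0.5
      }
    }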
Index: contrib/clusterer/src/java/org/apache/lucene/clusterer/MarkovChain.java
===================================================================
--- contrib/clusterer/src/java/org/apache/lucene/clusterer/MarkovChain.java (revision 0)
+++ contrib/clusterer/src/java/org/apache/lucene/clusterer/MarkovChain.java (revision 0)
@@ -0,0 +1,250 @@
+package org.apache.lucene.clusterer;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+import java.io.IOException;
+import java.util.*;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * todo: not sure why i use doubles rather than integers.
+ * todo: believe i was thinking about text length normalization or something.
+ *
+ * @author karl wettin
+ * Date: 2007-Oct-11
+ * Time: 11:33:12
+ */
+public class MarkovChain {
+
+  public MarkovChain() {
+  }
+
+  private Link startLink = this.new Link();
+  private Link endLink = this.new Link();
+
+  public void train(TokenStream tokenStream) throws IOException {
+    Link previousLink = startLink;
+
+    Token token;
+    while ((token = tokenStream.next()) != null) {
+      String termText = new String(token.termBuffer(), 0, token.termLength());
+      Link link = getLinksByTermText().get(termText);
+      if (link == null) {
+        link = this.new Link(termText);
+      }
+
+      Double weight;
+
+      weight = link.getPreviousLinks().get(previousLink);
+      if (weight == null) {
+        weight = 1d;
+      } else {
+        weight += 1d;
+      }
+      link.getPreviousLinks().put(previousLink, weight);
+
+      if (previousLink != null) {
+        weight = previousLink.getNextLinks().get(link);
+        if (weight == null) {
+          weight = 1d;
+        } else {
+          weight += 1d;
+        }
+        previousLink.getNextLinks().put(link, weight);
+      }
+
+      previousLink = link;
+    }
+
+    if (previousLink != null) {
+      Double weight = previousLink.getNextLinks().get(endLink);
+      if (weight == null) {
+        weight = 1d;
+      } else {
+        weight += 1d;
+      }
+      previousLink.getNextLinks().put(endLink, weight);
+    }
+  }
+
+  public Link extractProminentLink() {
+    Map<Link, Double> links = new HashMap<Link, Double>();
+    for (Link link : getLinksByTermText().values()) {
+      double weight = 0;
+      for (Map.Entry<Link, Double> next : link.getNextLinks().entrySet()) {
+        weight += next.getValue();
+      }
+      for (Map.Entry<Link, Double> previous : link.getPreviousLinks().entrySet()) {
+        weight += previous.getValue();
+      }
+      links.put(link, weight);
+    }
+    List<Map.Entry<Link, Double>> ordered = new ArrayList<Map.Entry<Link, Double>>(links.entrySet());
+    Collections.sort(ordered, new Comparator<Map.Entry<Link, Double>>() {
+      public int compare(Map.Entry<Link, Double> entry, Map.Entry<Link, Double> entry1) {
+        return entry1.getValue().compareTo(entry.getValue());
+      }
+    });
+    return ordered.get(0).getKey();
+  }
+
+  public String extractProminentChainString() {
+    StringBuilder sb = new StringBuilder(100);
+    for (Iterator<Link> it = extractProminentChain().iterator(); it.hasNext();) {
+      sb.append(it.next().getTermText());
+      if (it.hasNext()) {
+        sb.append(' ');
+      }
+    }
+    return sb.toString();
+  }
+
+
+  /**
+   * todo: this is really lame. needs to look ahead in case of equal scores in lists, etc.
+   * todo: this code is terribly slow and in dire need of optimization.
+   * todo: note that a link is only allowed once in a chain. this too can be considered wrong.
+   *
+   * @return the heaviest link, extended in both directions along the heaviest unvisited links
+   */
+  public LinkedList<Link> extractProminentChain() {
+    LinkedList<Link> ret = new LinkedList<Link>();
+    Link link = extractProminentLink();
+    ret.add(link);
+    extractProminentChainBackwards(ret, link);
+    extractProminentChainForwards(ret, link);
+    return ret;
+  }
+
+  private void extractProminentChainBackwards(LinkedList<Link> chain, Link previousLink) {
+    List<Map.Entry<Link, Double>> ordered = new ArrayList<Map.Entry<Link, Double>>(previousLink.getPreviousLinks().entrySet());
+    Collections.sort(ordered, new Comparator<Map.Entry<Link, Double>>() {
+      public int compare(Map.Entry<Link, Double> entry, Map.Entry<Link, Double> entry1) {
+        return entry1.getValue().compareTo(entry.getValue());
+      }
+    });
+
+    Link link = startLink;
+    for (Map.Entry<Link, Double> entry : ordered) {
+      link = entry.getKey();
+      if (!chain.contains(link)) {
+        break;
+      }
+    }
+
+    // stop when the start is reached, or when all candidates are already in the chain
+    if (startLink != link && !chain.contains(link)) {
+      chain.addFirst(link);
+      extractProminentChainBackwards(chain, link);
+    }
+
+  }
+
+
+  private void extractProminentChainForwards(LinkedList<Link> chain, Link previousLink) {
+    List<Map.Entry<Link, Double>> ordered = new ArrayList<Map.Entry<Link, Double>>(previousLink.getNextLinks().entrySet());
+    Collections.sort(ordered, new Comparator<Map.Entry<Link, Double>>() {
+      public int compare(Map.Entry<Link, Double> entry, Map.Entry<Link, Double> entry1) {
+        return entry1.getValue().compareTo(entry.getValue());
+      }
+    });
+
+    Link link = endLink;
+    for (Map.Entry<Link, Double> entry : ordered) {
+      link = entry.getKey();
+      if (!chain.contains(link)) {
+        break;
+      }
+    }
+
+    // stop when the end is reached, or when all candidates are already in the chain
+    if (endLink != link && !chain.contains(link)) {
+      chain.add(link);
+      extractProminentChainForwards(chain, link);
+    }
+
+  }
+
+  private Map<String, Link> linksByTermText = new HashMap<String, Link>();
+
+
+  public Link getStartLink() {
+    return startLink;
+  }
+
+  public Link getEndLink() {
+    return endLink;
+  }
+
+  public Map<String, Link> getLinksByTermText() {
+    return linksByTermText;
+  }
+
+  public void setLinksByTermText(Map<String, Link> linksByTermText) {
+    this.linksByTermText = linksByTermText;
+  }
+
+  public class Link {
+
+    private String termText;
+
+    private Link() {
+
+    }
+
+    public Link(String termText) {
+      if (linksByTermText.containsKey(termText)) {
+        throw new IllegalArgumentException("Link with termText '" + termText + "' already exists");
+      }
+      this.termText = termText;
+      linksByTermText.put(termText, this);
+    }
+
+    private Map<Link, Double> previousLinks = new HashMap<Link, Double>();
+    private Map<Link, Double> nextLinks = new HashMap<Link, Double>();
+
+
+    public String getTermText() {
+      return termText;
+    }
+
+    public void setTermText(String termText) {
+      this.termText = termText;
+    }
+
+    public Map<Link, Double> getPreviousLinks() {
+      return previousLinks;
+    }
+
+    public void setPreviousLinks(Map<Link, Double> previousLinks) {
+      this.previousLinks = previousLinks;
+    }
+
+    public Map<Link, Double> getNextLinks() {
+      return nextLinks;
+    }
+
+    public void setNextLinks(Map<Link, Double> nextLinks) {
+      this.nextLinks = nextLinks;
+    }
+
+
+    public String toString() {
+      return termText;
+    }
+  }
+
+}
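A minimal training sketch (not part of the patch), mirroring the WhitespaceTokenizer and StringReader setup already used by the tests in this patch. Ties between equally weighted links make the exact output unspecified, but for this input the prominent chain is typically "lucene is a search engine":

    MarkovChain chain = new MarkovChain();
    chain.train(new WhitespaceTokenizer(new StringReader("lucene is a search engine")));
    chain.train(new WhitespaceTokenizer(new StringReader("lucene is a library")));
    // seeds the chain with the heaviest link and greedily extends it
    // backwards and forwards along the heaviest unvisited links
    System.out.println(chain.extractProminentChainString());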
Index: contrib/clusterer/src/java/org/apache/lucene/clusterer/AppendingVectorSpaceFrequencyMapper.java
===================================================================
--- contrib/clusterer/src/java/org/apache/lucene/clusterer/AppendingVectorSpaceFrequencyMapper.java (revision 0)
+++ contrib/clusterer/src/java/org/apache/lucene/clusterer/AppendingVectorSpaceFrequencyMapper.java (revision 0)
@@ -0,0 +1,69 @@
+package org.apache.lucene.clusterer;
+
+import org.apache.lucene.index.TermVectorMapper;
+import org.apache.lucene.index.TermVectorOffsetInfo;
+
+import java.util.HashMap;
+import java.util.Map;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * @author karl wettin
+ * Date: 2007-Oct-03
+ * Time: 06:59:05
+ */
+public class AppendingVectorSpaceFrequencyMapper extends TermVectorMapper {
+
+
+  public boolean isIgnoringPositions() {
+    return true;
+  }
+
+  public boolean isIgnoringOffsets() {
+    return true;
+  }
+
+  private Map<String, Map<String, Double>> termFrequencyVectors = new HashMap<String, Map<String, Double>>();
+  private Map<String, Double> currentTermFrequencyVector;
+
+  public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
+    currentTermFrequencyVector = termFrequencyVectors.get(field);
+    if (currentTermFrequencyVector == null) {
+      currentTermFrequencyVector = new HashMap<String, Double>(numTerms);
+      termFrequencyVectors.put(field, currentTermFrequencyVector);
+    }
+  }
+
+  public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
+    Double val = currentTermFrequencyVector.get(term);
+    if (val == null) {
+      val = 0d;
+    }
+    currentTermFrequencyVector.put(term, val + frequency);
+
+  }
+
+  public Map<String, Map<String, Double>> getTermFrequencyVectors() {
+    return termFrequencyVectors;
+  }
+
+  public Map<String, Double> getCurrentTermFrequencyVector() {
+    return currentTermFrequencyVector;
+  }
+
+
+}
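Usage sketch (not part of the patch): because setExpectations appends into an existing per-field map instead of replacing it, a single mapper instance can accumulate the term vectors of several documents into one vector space. Document numbers are illustrative and assume term vectors were stored at index time:

    AppendingVectorSpaceFrequencyMapper mapper = new AppendingVectorSpaceFrequencyMapper();
    indexReader.getTermFreqVector(0, mapper);
    indexReader.getTermFreqVector(1, mapper);
    Map<String, Map<String, Double>> vectorSpace = mapper.getTermFrequencyVectors();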
Index: contrib/clusterer/src/java/org/apache/lucene/clusterer/HacGqfTermReducer.java
===================================================================
--- contrib/clusterer/src/java/org/apache/lucene/clusterer/HacGqfTermReducer.java (revision 0)
+++ contrib/clusterer/src/java/org/apache/lucene/clusterer/HacGqfTermReducer.java (revision 0)
@@ -0,0 +1,105 @@
+package org.apache.lucene.clusterer;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.util.PriorityQueue;
+
+import java.io.IOException;
+import java.util.Set;
+import java.util.HashSet;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * Discards terms that are too common in the corpus or unique to a single document.
+ *
+ * Expects all documents in the corpus to be instances of the cluster.
+ */
+public class HacGqfTermReducer {
+
+  private double threadsholdTfIdf = 0.05d;
+  private double threadsholdDocumentPercentileFrequency = 0.25d;
+
+  public HacGqfTermReducer() {
+
+  }
+
+  public HacGqfTermReducer(double threadsholdTfIdf, double threadsholdDocumentPercentileFrequency) {
+    this.threadsholdTfIdf = threadsholdTfIdf;
+    this.threadsholdDocumentPercentileFrequency = threadsholdDocumentPercentileFrequency;
+  }
+
+  private Set<Term> reducedTerms = new HashSet<Term>(1000);
+//  private Set<Term> acceptedTerms = new HashSet<Term>(1000);
+
+  public boolean accept(IndexReader reader, Term term) throws IOException {
+    if (reducedTerms.contains(term)) {
+      return false;
+    }
+
+    TermEnum termEnum = reader.terms(term);
+
+    double numberOfDocuments = reader.numDocs();
+    double documentFrequency = termEnum.docFreq();
+
+    termEnum.close();
+
+    double documentPercentileFrequency = documentFrequency / numberOfDocuments;
+
+    if (documentPercentileFrequency > threadsholdDocumentPercentileFrequency) {
+      // term is available in too large a portion of the corpus
+      System.out.println("dpf " + documentPercentileFrequency + "\t" + term);
+      reducedTerms.add(term);
+      return false;
+    }
+
+    double tfidf = Math.log(numberOfDocuments / documentFrequency);
+
+    if (documentFrequency == 1d) {
+      // term is only available in one document
+      System.out.println("unique \t" + term);
+      reducedTerms.add(term);
+      return false;
+    } else if (tfidf <= threadsholdTfIdf) {
+      reducedTerms.add(term);
+      System.out.println("tfidf " + tfidf + "\t" + term);

+      return false;
+    } else {
+//      acceptedTerms.add(term);
+      return true;
+    }
+
+  }
+
+
+  public double getThreadsholdTfIdf() {
+    return threadsholdTfIdf;
+  }
+
+  public void setThreadsholdTfIdf(double threadsholdTfIdf) {
+    this.threadsholdTfIdf = threadsholdTfIdf;
+  }
+
+  public double getThreadsholdDocumentPercentileFrequency() {
+    return threadsholdDocumentPercentileFrequency;
+  }
+
+  public void setThreadsholdDocumentPercentileFrequency(double threadsholdDocumentPercentileFrequency) {
+    this.threadsholdDocumentPercentileFrequency = threadsholdDocumentPercentileFrequency;
+  }
+}
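Usage sketch (not part of the patch): the reducer is a stateful filter, so a typical caller walks the reader's term enumeration once and keeps only the accepted terms when building instance vector spaces. The indexReader variable is assumed to be an open IndexReader:

    HacGqfTermReducer reducer = new HacGqfTermReducer(0.05d, 0.25d);
    TermEnum termEnum = indexReader.terms();
    while (termEnum.next()) {
      Term term = termEnum.term();
      if (reducer.accept(indexReader, term)) {
        // keep this term when building the vector spaces
      }
    }
    termEnum.close();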
Index: contrib/clusterer/src/java/org/apache/lucene/document/IdxReader.java
===================================================================
--- contrib/clusterer/src/java/org/apache/lucene/document/IdxReader.java (revision 0)
+++ contrib/clusterer/src/java/org/apache/lucene/document/IdxReader.java (revision 0)
@@ -0,0 +1,113 @@
+package org.apache.lucene.document;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * Utility to read Autonomy DRE IDX document files
+ */
+public class IdxReader {
+
+  public static final String DRECONTENT_FIELD = "DRECONTENT";
+  private static final String DRECONTENT_HEAD = "#DRECONTENT";
+  private static final String DREENDDOC = "#DREENDDOC";
+
+  public static final String DRETITLE_FIELD = "DRETITLE";
+  private static final String DRETITLE_HEAD = "#DRETITLE";
+
+  private static final Pattern DREFIELD_PATTERN = Pattern.compile("^\\#DREFIELD\\s+([^=]+)\\s*=\\s*\\\"(.*)\\\"\\s*$");
+  private static final Pattern DREVALUE_PATTERN = Pattern.compile("^\\#(DRE[^\\s]+)\\s+(.*)\\s*$");
+
+  public static final Set<String> PARSE_ALL_FIELDS = null;
+  private Set<String> fieldsToParse = PARSE_ALL_FIELDS;
+  private Map<String, String> fieldValues = new HashMap<String, String>();
+
+  public void parse(Reader reader) throws IOException {
+    parse(new BufferedReader(reader));
+  }
+
+  public void parse(BufferedReader reader) throws IOException {
+
+    fieldValues.clear();
+
+    String line;
+    while ((line = reader.readLine()) != null) {
+      if (line.startsWith(DREENDDOC)) {
+        return;
+      } else if (DRETITLE_HEAD.equals(line)) {
+        line = reader.readLine();
+        if (PARSE_ALL_FIELDS == fieldsToParse || fieldsToParse.contains(DRETITLE_FIELD)) {
+          fieldValues.put(DRETITLE_FIELD, line);
+        }
+      } else if (DRECONTENT_HEAD.equals(line)) {
+        StringBuffer buf = new StringBuffer(10000);
+        while ((line = reader.readLine()) != null) {
+          if (line.startsWith(DREENDDOC)) {
+            break;
+          } else {
+            buf.append(line);
+            buf.append("\n");
+          }
+        }
+        if (PARSE_ALL_FIELDS == fieldsToParse || fieldsToParse.contains(DRECONTENT_FIELD)) {
+          fieldValues.put(DRECONTENT_FIELD, buf.toString());
+        }
+        return;
+      } else {
+        Matcher matcher = DREFIELD_PATTERN.matcher(line);
+        if (!matcher.matches()) {
+          matcher = DREVALUE_PATTERN.matcher(line);
+          if (!matcher.matches()) {
+            throw new IOException("Invalid DRE IDX line: " + line);
+          }
+        }
+        if (PARSE_ALL_FIELDS == fieldsToParse || fieldsToParse.contains(matcher.group(1))) {
+          String oldValue = fieldValues.put(matcher.group(1), matcher.group(2));
+          if (oldValue != null && !oldValue.equals(matcher.group(2))) {
+            throw new IOException("Non-matching repetitive fields: " + matcher.group(1));
+          }
+        }
+      }
+    }
+
+  }
+
+
+  public Set<String> getFieldsToParse() {
+    return fieldsToParse;
+  }
+
+  public Map<String, String> getFieldValues() {
+    return fieldValues;
+  }
+
+
+  public void setFieldsToParse(Set<String> fieldsToParse) {
+    this.fieldsToParse = fieldsToParse;
+  }
+
+  public void setFieldValues(Map<String, String> fieldValues) {
+    this.fieldValues = fieldValues;
+  }
+}
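For illustration (not part of the patch), a fragment of the DRE IDX format the reader understands, followed by a usage sketch. parse(...) consumes a single document and returns at #DREENDDOC, so it is called once per document; the file name is illustrative:

    #DREREFERENCE 1234
    #DRETITLE
    Some document title
    #DREFIELD author="karl"
    #DRECONTENT
    Body text of the document
    goes here.
    #DREENDDOC

    BufferedReader reader = new BufferedReader(new FileReader("documents.idx"));
    IdxReader idxReader = new IdxReader();
    idxReader.parse(reader);
    String title = idxReader.getFieldValues().get(IdxReader.DRETITLE_FIELD);
    String content = idxReader.getFieldValues().get(IdxReader.DRECONTENT_FIELD);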