Index: lucene/contrib/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java
===================================================================
--- lucene/contrib/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java (revision 0)
+++ lucene/contrib/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java (revision 0)
@@ -0,0 +1,237 @@
+package org.apache.lucene.misc;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.store.MockRAMDirectory;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+/**
+ * Tests for {@link HighFreqTermsWithTF}.
+ *
+ * The fixture index built in {@link #indexDocs(IndexWriter)} contains:
+ * <ul>
+ * <li>ten documents over the numeric terms "1".."10", where term n occurs in
+ * n documents and n times per document (docFreq n, totalTermFreq n*n);</li>
+ * <li>one document containing "highTF" 200 times (docFreq 1);</li>
+ * <li>five documents each containing "highTFmedDF" 25 times (docFreq 5,
+ * totalTermFreq 125).</li>
+ * </ul>
+ */
+public class TestHighFreqTerms extends LuceneTestCase {
+
+  /** Name of the only field written to the fixture index. */
+  private static final String FIELD = "field";
+
+  /** Number of distinct terms in the fixture index; every test asks for all of them. */
+  private static final int NUM_TERMS = 12;
+
+  private static IndexWriter writer = null;
+  private static MockRAMDirectory dir = null;
+
+  private static void setDir(MockRAMDirectory d) {
+    dir = d;
+  }
+
+  private static MockRAMDirectory getDir() {
+    return dir;
+  }
+
+  /** Builds a fresh in-memory fixture index before every test. */
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    setDir(new MockRAMDirectory());
+    writer = new IndexWriter(dir, new IndexWriterConfig(
+        TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT))
+        .setMaxBufferedDocs(2));
+    indexDocs(writer);
+  }
+
+  /** Releases the fixture directory so no index is kept alive between tests. */
+  @Override
+  public void tearDown() throws Exception {
+    try {
+      if (dir != null) {
+        dir.close();
+      }
+    } finally {
+      dir = null;
+      writer = null;
+      super.tearDown();
+    }
+  }
+
+  // Tests for HighFreqTermsWithTF.getHighFreqTerms
+
+  public static void testFirstTermHighestDocFreq() throws Exception {
+    TermStats[] terms = getHighFreqTermsArray(NUM_TERMS);
+    // term "10" occurs in all ten numeric documents
+    assertEquals("Term with highest docfreq is first", 10, terms[0].docFreq);
+  }
+
+  public static void testOrderedByDocFreqDescending() throws Exception {
+    TermStats[] terms = getHighFreqTermsArray(NUM_TERMS);
+    for (int i = 1; i < terms.length; i++) {
+      assertTrue("out of order " + terms[i - 1].docFreq + " should be >= " + terms[i].docFreq,
+          terms[i - 1].docFreq >= terms[i].docFreq);
+    }
+  }
+
+  public static void testNumTerms() throws Exception {
+    TermStats[] terms = getHighFreqTermsArray(NUM_TERMS);
+    assertEquals("length of terms array equals numTerms :" + NUM_TERMS,
+        NUM_TERMS, terms.length);
+  }
+
+  public static void testGetHighFreqTerms() throws Exception {
+    TermStats[] terms = getHighFreqTermsArray(NUM_TERMS);
+
+    for (int i = 0; i < terms.length; i++) {
+      String termtext = terms[i].termtext.utf8ToString();
+      // the two non-numeric terms have hardcoded expectations
+      if (termtext.contains("highTF")) {
+        if (termtext.contains("medDF")) {
+          assertEquals("doc freq is not as expected", 5, terms[i].docFreq);
+        } else {
+          assertEquals("doc freq is not as expected", 1, terms[i].docFreq);
+        }
+      } else {
+        int n = Integer.parseInt(termtext);
+        assertEquals("doc freq is not as expected", getExpecteddocFreq(n),
+            terms[i].docFreq);
+      }
+    }
+  }
+
+  // Tests for HighFreqTermsWithTF.sortByTotalTermFreq
+
+  public static void testFirstTermHighestTotalTermFreq() throws Exception {
+    IndexReader reader = IndexReader.open(getDir(), true);
+    try {
+      TermStats[] terms = getHighFreqTermsArray(NUM_TERMS);
+      TermStats[] termsWithTotalTermFreq = HighFreqTermsWithTF.sortByTotalTermFreq(reader, terms);
+      assertEquals("Term with highest totalTermFreq is first", 200,
+          termsWithTotalTermFreq[0].totalTermFreq);
+    } finally {
+      reader.close();
+    }
+  }
+
+  public static void testOrderedByTermFreqDescending() throws Exception {
+    IndexReader reader = IndexReader.open(getDir(), true);
+    try {
+      TermStats[] terms = getHighFreqTermsArray(NUM_TERMS);
+      TermStats[] termsWithTF = HighFreqTermsWithTF.sortByTotalTermFreq(reader, terms);
+
+      // descending order allows ties, so adjacent entries must be non-increasing
+      for (int i = 1; i < termsWithTF.length; i++) {
+        assertTrue("out of order " + termsWithTF[i - 1].totalTermFreq
+            + " should be >= " + termsWithTF[i].totalTermFreq,
+            termsWithTF[i - 1].totalTermFreq >= termsWithTF[i].totalTermFreq);
+      }
+    } finally {
+      reader.close();
+    }
+  }
+
+  public static void testGetTermFreqOrdered() throws Exception {
+    IndexReader reader = IndexReader.open(getDir(), true);
+    try {
+      TermStats[] terms = getHighFreqTermsArray(NUM_TERMS);
+      TermStats[] termsWithTF = HighFreqTermsWithTF.sortByTotalTermFreq(reader, terms);
+
+      for (int i = 0; i < termsWithTF.length; i++) {
+        String text = termsWithTF[i].termtext.utf8ToString();
+        if (text.contains("highTF")) {
+          if (text.contains("medDF")) {
+            assertEquals("total term freq is expected", 125,
+                termsWithTF[i].totalTermFreq);
+          } else {
+            assertEquals("total term freq is expected", 200,
+                termsWithTF[i].totalTermFreq);
+          }
+        } else {
+          int n = Integer.parseInt(text);
+          assertEquals("doc freq is expected", getExpecteddocFreq(n),
+              termsWithTF[i].docFreq);
+          assertEquals("total term freq is expected", getExpectedtotalTermFreq(n),
+              termsWithTF[i].totalTermFreq);
+        }
+      }
+    } finally {
+      reader.close();
+    }
+  }
+
+  // Tests for HighFreqTermsWithTF.getTotalTermFreq
+
+  public static void testGetTotalTermFreq() throws Exception {
+    IndexReader reader = IndexReader.open(getDir(), true);
+    long totalTermFreq;
+    try {
+      totalTermFreq = HighFreqTermsWithTF.getTotalTermFreq(reader, FIELD, new BytesRef("highTF"));
+    } finally {
+      reader.close();
+    }
+    assertEquals("highTf tf should be 200", 200, totalTermFreq);
+  }
+
+  public static void testGetTotalTermFreqBadTerm() throws Exception {
+    IndexReader reader = IndexReader.open(getDir(), true);
+    long totalTermFreq;
+    try {
+      totalTermFreq = HighFreqTermsWithTF.getTotalTermFreq(reader, FIELD, new BytesRef("foobar"));
+    } finally {
+      reader.close();
+    }
+    assertEquals("totalTermFreq should be 0 for term not in index", 0, totalTermFreq);
+  }
+
+  /************************************************************************************************/
+
+  /**
+   * Opens a short-lived reader on the fixture index and returns the top
+   * numTerms terms of the test field by docFreq. The reader is closed before
+   * returning; TermStats copies the term bytes in its constructor, so the
+   * result does not depend on the reader.
+   */
+  private static TermStats[] getHighFreqTermsArray(int numTerms) throws Exception {
+    IndexReader reader = IndexReader.open(getDir(), true);
+    try {
+      return HighFreqTermsWithTF.getHighFreqTerms(reader, numTerms, FIELD);
+    } finally {
+      reader.close();
+    }
+  }
+
+  /**
+   * Writes the fixture documents and closes the writer.
+   */
+  private static void indexDocs(IndexWriter writer) throws Exception {
+    // Ten documents over the terms "1".."10": term n ends up with a docFreq
+    // of n and a totalTermFreq of n*n (n squared).
+    for (int i = 1; i <= 10; i++) {
+      writer.addDocument(newDoc(getContent(i)));
+    }
+
+    // Terms whose totalTermFreq ranking differs from their docFreq ranking,
+    // so that sorting by totalTermFreq can be told apart from sorting by docFreq.
+
+    // high tf, low df: 200 occurrences in a single document
+    writer.addDocument(newDoc(repeat("highTF", 200)));
+
+    // high tf, medium df: 25 occurrences in each of 5 documents
+    int medium_df = 5;
+    for (int i = 0; i < medium_df; i++) {
+      writer.addDocument(newDoc(repeat("highTFmedDF", 25)));
+    }
+
+    writer.close();
+  }
+
+  /** Wraps content in a document with one stored, analyzed field. */
+  private static Document newDoc(String content) {
+    Document doc = new Document();
+    doc.add(new Field(FIELD, content, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
+    return doc;
+  }
+
+  /** Returns token followed by a single space, repeated the given number of times. */
+  private static String repeat(String token, int times) {
+    StringBuilder sb = new StringBuilder();
+    for (int k = 0; k < times; k++) {
+      sb.append(token).append(' ');
+    }
+    return sb.toString();
+  }
+
+  /**
+   * Returns a string containing the numbers 10 down to i, where each number n
+   * occurs n times, e.g. for an input of 9 the result is ten "10" tokens
+   * followed by nine "9" tokens.
+   */
+  private static String getContent(int i) {
+    StringBuilder s = new StringBuilder();
+    for (int j = 10; j >= i; j--) {
+      // if j is 3 this appends "3 3 3 "
+      s.append(repeat(String.valueOf(j), j));
+    }
+    return s.toString();
+  }
+
+  private static int getExpectedtotalTermFreq(int i) {
+    return getExpecteddocFreq(i) * i;
+  }
+
+  private static int getExpecteddocFreq(int i) {
+    return i;
+  }
+}
Property changes on: lucene/contrib/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTermsWithTF.java
===================================================================
--- lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTermsWithTF.java (revision 0)
+++ lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTermsWithTF.java (revision 0)
@@ -0,0 +1,265 @@
+package org.apache.lucene.misc;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.FieldsEnum;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.FieldReaderException;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.PriorityQueue;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.Bits;
+import java.io.File;
+import java.util.Arrays;
+import java.util.Comparator;
+
+/**
+ * HighFreqTermsWithTF extracts the top n most frequent terms (by document
+ * frequency) from an existing Lucene index and reports both their document
+ * frequency and their total tf (total number of occurrences), ordered by
+ * highest total tf first.
+ */
+public class HighFreqTermsWithTF {
+
+ // The top numTerms will be displayed
+ public static final int DEFAULTnumTerms = 100;
+ public static int numTerms = DEFAULTnumTerms;
+
+ /**
+  * Command-line entry point. Accepts one to three arguments: the index
+  * directory, optionally the number of terms to report (stored in
+  * {@link #numTerms}), and optionally a field to restrict the report to.
+  * Prints one line per term, ordered by total term frequency.
+  * Any other argument count, or a non-integer second argument, prints the
+  * usage text and exits with status 1.
+  */
+ public static void main(String[] args) throws Exception {
+   if (args.length < 1 || args.length > 3) {
+     usage();
+     System.exit(1);
+   }
+
+   if (args.length >= 2) {
+     try {
+       numTerms = Integer.parseInt(args[1]);
+     } catch (NumberFormatException e) {
+       System.err.println("second argument must be an integer");
+       usage();
+       System.exit(1);
+     }
+   }
+   String field = (args.length == 3) ? args[2] : null;
+
+   FSDirectory dir = FSDirectory.open(new File(args[0]));
+   try {
+     IndexReader reader = IndexReader.open(dir, true);
+     try {
+       TermStats[] terms = getHighFreqTerms(reader, numTerms, field);
+       TermStats[] termsWithTF = sortByTotalTermFreq(reader, terms);
+
+       for (int i = 0; i < termsWithTF.length; i++) {
+         System.out.printf("%s:%s \t totalTF = %,d \t doc freq = %,d \n",
+             termsWithTF[i].field, termsWithTF[i].termtext.utf8ToString(),
+             termsWithTF[i].totalTermFreq, termsWithTF[i].docFreq);
+       }
+     } finally {
+       // release the reader even when collecting or printing fails
+       reader.close();
+     }
+   } finally {
+     dir.close();
+   }
+ }
+
+ /**
+  * Collects the terms with the highest document frequency.
+  *
+  * @param reader index to read terms from
+  * @param numTerms maximum number of terms to return
+  * @param field field to restrict the search to, or null to consider the
+  *        terms of every field in the index
+  * @return TermStats[] ordered by terms with highest docFreq first; empty
+  *         when the requested field has no terms
+  * @throws FieldReaderException if the index exposes no fields at all
+  * @throws Exception on errors reading the index
+  */
+ public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String field) throws Exception {
+   Fields fields = MultiFields.getFields(reader);
+   if (fields == null) {
+     throw new FieldReaderException(field != null
+         ? "field " + field + " not found"
+         : "no fields found for this index");
+   }
+
+   TermStatsQueue tiq = null;
+   if (field != null) {
+     Terms terms = fields.terms(field);
+     if (terms != null) {
+       tiq = fillQueue(terms.iterator(), numTerms, field);
+     }
+   } else {
+     // Merge every field's candidates into one bounded queue. Keeping only
+     // the queue returned by the last fillQueue call would drop the terms
+     // of all earlier fields.
+     // NOTE(review): assumes fillQueue returns a queue holding the terms of
+     // the field passed to it; its body is not visible here - confirm.
+     tiq = new TermStatsQueue(numTerms);
+     FieldsEnum fieldsEnum = fields.iterator();
+     String currentField;
+     while ((currentField = fieldsEnum.next()) != null) {
+       TermStatsQueue perField = fillQueue(fieldsEnum.terms(), numTerms, currentField);
+       while (perField != null && perField.size() != 0) {
+         tiq.insertWithOverflow(perField.pop());
+       }
+     }
+   }
+
+   if (tiq == null) {
+     // the requested field has no terms: nothing to report
+     return new TermStats[0];
+   }
+
+   TermStats[] result = new TermStats[tiq.size()];
+   // pop() yields the lowest docFreq first, so fill the array from the end
+   // to get the highest docFreq at index 0
+   for (int slot = result.length - 1; tiq.size() != 0; slot--) {
+     result[slot] = tiq.pop();
+   }
+   return result;
+ }
+
+ /**
+  * Computes the total term frequency of every entry in terms and returns the
+  * enriched entries ordered by total term frequency, highest first.
+  * The input array is left untouched; each output element is a new TermStats
+  * carrying the same field, term text and docFreq plus the summed tf.
+  *
+  * @param reader index used to look up the per-term totals
+  * @param terms TermStats[] whose totals should be computed
+  * @return TermStats[] sorted by descending totalTermFreq
+  * @throws Exception on errors reading the index
+  */
+ public static TermStats[] sortByTotalTermFreq(IndexReader reader, TermStats[] terms) throws Exception {
+   final TermStats[] withTotals = new TermStats[terms.length];
+   int idx = 0;
+   for (TermStats stat : terms) {
+     final long total = getTotalTermFreq(reader, stat.field, stat.termtext);
+     withTotals[idx++] = new TermStats(stat.field, stat.termtext, stat.docFreq, total);
+   }
+   // descending by totalTermFreq
+   Arrays.sort(withTotals, new ReverseTotalTermFreqComparator());
+   return withTotals;
+ }
+
+ public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termtext) throws Exception {
+ BytesRef br = termtext;
+ long totalTF = 0;
+ Bits skipDocs = MultiFields.getDeletedDocs(reader);
+ DocsEnum de = MultiFields.getTermDocsEnum(reader, skipDocs, field, br);
+ // if term is not in index return totalTF of 0
+ if (de == null) {
+ return 0;
+ }
+ // use DocsEnum.read() and BulkResult api
+ final DocsEnum.BulkReadResult bulkresult = de.getBulkResult();
+ int count;
+ while ((count = de.read()) != 0) {
+ final int[] freqs = bulkresult.freqs.ints;
+ final int limit = bulkresult.freqs.offset + count;
+ for(int i=bulkresult.freqs.offset;i [number_terms | number_terms field]\n\n");
+ }
+}
+
+/**
+ * Orders TermStats by totalTermFreq, largest first.
+ *
+ * This is the reverse of the natural ordering: compare returns 1 when
+ * a.totalTermFreq is less than b.totalTermFreq, so sorting with this
+ * comparator yields descending totalTermFreq.
+ */
+final class ReverseTotalTermFreqComparator implements Comparator<TermStats> {
+
+  @Override
+  public int compare(TermStats a, TermStats b) {
+    if (a.totalTermFreq < b.totalTermFreq) {
+      return 1;
+    } else if (a.totalTermFreq > b.totalTermFreq) {
+      return -1;
+    } else {
+      return 0;
+    }
+  }
+}
+
+/**
+ * Holder for the statistics of one term: its field, its text, its document
+ * frequency and, when it has been computed, its total term frequency.
+ */
+final class TermStats {
+  BytesRef termtext;
+  String field;
+  int docFreq;
+  long totalTermFreq;
+
+  /** Creates stats without a total term frequency (it stays 0). */
+  TermStats(String field, BytesRef termtext, int df) {
+    this(field, termtext, df, 0L);
+  }
+
+  /** Creates fully populated stats; the term bytes are copied, not shared. */
+  TermStats(String field, BytesRef termtext, int df, long tf) {
+    this.field = field;
+    this.termtext = new BytesRef(termtext);
+    this.docFreq = df;
+    this.totalTermFreq = tf;
+  }
+
+  /** Returns the term text decoded from UTF-8. */
+  String getTermText() {
+    return termtext.utf8ToString();
+  }
+}
+
+/**
+ * Priority queue for TermStats objects ordered by docFreq: the entry with
+ * the lowest docFreq is the least element.
+ **/
+final class TermStatsQueue extends PriorityQueue<TermStats> {
+  TermStatsQueue(int size) {
+    initialize(size);
+  }
+
+  @Override
+  protected final boolean lessThan(TermStats termInfoA, TermStats termInfoB) {
+    return termInfoA.docFreq < termInfoB.docFreq;
+  }
+}
Property changes on: lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTermsWithTF.java
___________________________________________________________________
Added: svn:eol-style
+ native