? src/java/org/apache/lucene/index/SegmentTermPositionVector.java
? src/java/org/apache/lucene/index/TermVectorOffsetInfo.java
cvs server: Diffing .
cvs server: Diffing docs
cvs server: Diffing docs/images
cvs server: Diffing docs/lucene-sandbox
cvs server: Diffing docs/lucene-sandbox/larm
cvs server: Diffing lib
cvs server: Diffing src
cvs server: Diffing src/demo
cvs server: Diffing src/demo/org
cvs server: Diffing src/demo/org/apache
cvs server: Diffing src/demo/org/apache/lucene
cvs server: Diffing src/demo/org/apache/lucene/demo
cvs server: Diffing src/demo/org/apache/lucene/demo/html
cvs server: Diffing src/java
cvs server: Diffing src/java/org
cvs server: Diffing src/java/org/apache
cvs server: Diffing src/java/org/apache/lucene
cvs server: Diffing src/java/org/apache/lucene/analysis
cvs server: Diffing src/java/org/apache/lucene/analysis/de
cvs server: Diffing src/java/org/apache/lucene/analysis/ru
cvs server: Diffing src/java/org/apache/lucene/analysis/standard
cvs server: Diffing src/java/org/apache/lucene/document
Index: src/java/org/apache/lucene/document/Field.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/document/Field.java,v
retrieving revision 1.22
diff -u -r1.22 Field.java
--- src/java/org/apache/lucene/document/Field.java 1 Sep 2004 22:11:07 -0000 1.22
+++ src/java/org/apache/lucene/document/Field.java 8 Sep 2004 14:33:21 -0000
@@ -16,11 +16,12 @@
* limitations under the License.
*/
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.Similarity;
+
import java.io.Reader;
-import java.util.Date;
-import org.apache.lucene.index.IndexReader; // for javadoc
-import org.apache.lucene.search.Similarity; // for javadoc
-import org.apache.lucene.search.Hits; // for javadoc
+import java.util.Date; // for javadoc
/**
A field is a section of a Document. Each field has two parts, a name and a
@@ -34,6 +35,8 @@
private String name = "body";
private String stringValue = null;
private boolean storeTermVector = false;
+ private boolean storeOffsetWithTermVector = false;
+ private boolean storePositionWithTermVector = false;
private Reader readerValue = null;
private boolean isStored = false;
private boolean isIndexed = true;
@@ -86,11 +89,22 @@
}
public static final class TermVector {
+ public static final boolean HAS_POSITIONS = true;
+ public static final boolean HAS_OFFSETS = true;
private String name;
+ private boolean storePositions = false;
+ private boolean storeOffsets = false;
private TermVector() {}
private TermVector(String name) {
this.name = name;
}
+
+ private TermVector(String name, boolean storeOffsets, boolean storePositions) {
+ this.name = name;
+ this.storeOffsets = storeOffsets;
+ this.storePositions = storePositions;
+ }
+
public String toString() {
return name;
}
@@ -100,6 +114,26 @@
/** Store the term vectors of each document. A term vector is a list
   * of the document's terms and their number of occurrences in that document. */
public static final TermVector YES = new TermVector("YES");
+ /**
+ * Store the term vector + token position information
+ *
+ * @see #YES
+ */
+ public static final TermVector WITH_POSITIONS = new TermVector("WITH_POSITIONS", HAS_POSITIONS, !HAS_OFFSETS);
+ /**
+   * Store the term vector + token offset information
+ *
+ * @see #YES
+ */
+ public static final TermVector WITH_OFFSETS = new TermVector("WITH_OFFSETS", !HAS_POSITIONS, HAS_OFFSETS);
+ /**
+   * Store the term vector + token position and offset information
+ *
+ * @see #YES
+ * @see #WITH_POSITIONS
+ * @see #WITH_OFFSETS
+ */
+ public static final TermVector WITH_POSITIONS_OFFSETS = new TermVector("WITH_POSITIONS_OFFSETS", HAS_POSITIONS, HAS_OFFSETS);
}
   /** Sets the boost factor for hits on this field. This value will be
@@ -373,7 +407,25 @@
this.storeTermVector = false;
} else if (termVector == TermVector.YES) {
this.storeTermVector = true;
- } else {
+ this.storePositionWithTermVector = termVector.storePositions;
+ this.storeOffsetWithTermVector = termVector.storeOffsets;
+ }
+ else if (termVector == TermVector.WITH_POSITIONS) {
+ this.storeTermVector = true;
+ this.storePositionWithTermVector = true;
+ this.storeOffsetWithTermVector = false;
+ }
+ else if (termVector == TermVector.WITH_OFFSETS) {
+ this.storeTermVector = true;
+ this.storePositionWithTermVector = false;
+ this.storeOffsetWithTermVector = true;
+ }
+ else if (termVector == TermVector.WITH_POSITIONS_OFFSETS) {
+ this.storeTermVector = true;
+ this.storePositionWithTermVector = true;
+ this.storeOffsetWithTermVector = true;
+ }
+ else {
throw new IllegalArgumentException("unknown termVector parameter " + termVector);
}
}
@@ -402,6 +454,14 @@
*/
public final boolean isTermVectorStored() { return storeTermVector; }
+  /** True if the offsets of this field's terms are stored with its term vector */
+  public boolean isStoreOffsetWithTermVector(){
+    return storeOffsetWithTermVector;
+  }
+
+  /** True if the positions of this field's terms are stored with its term vector */
+  public boolean isStorePositionWithTermVector(){
+    return storePositionWithTermVector;
+  }
+
/** Prints a Field for human consumption. */
public final String toString() {
StringBuffer result = new StringBuffer();
@@ -422,6 +482,16 @@
result.append(",");
result.append("termVector");
}
+ if (storeOffsetWithTermVector) {
+ if (result.length() > 0)
+ result.append(",");
+ result.append("termVectorOffsets");
+ }
+ if (storePositionWithTermVector) {
+ if (result.length() > 0)
+ result.append(",");
+ result.append("termVectorPosition");
+ }
result.append('<');
result.append(name);
result.append(':');
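
For reference, a minimal sketch of how the new Field.TermVector constants are meant to be used at index time (the analyzer, directory, and field values below are illustrative, not part of this patch; the five-argument Field constructor is the one exercised by DocHelper further down):

    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.RAMDirectory;

    public class TermVectorFieldExample {
      public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
        Document doc = new Document();
        // store the field, tokenize it, and record positions + offsets in its term vector
        doc.add(new Field("body", "the quick brown fox", Field.Store.YES,
                          Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
        writer.addDocument(doc);
        writer.close();
      }
    }
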
cvs server: Diffing src/java/org/apache/lucene/index
Index: src/java/org/apache/lucene/index/DocumentWriter.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java,v
retrieving revision 1.12
diff -u -r1.12 DocumentWriter.java
--- src/java/org/apache/lucene/index/DocumentWriter.java 10 Jul 2004 06:19:01 -0000 1.12
+++ src/java/org/apache/lucene/index/DocumentWriter.java 8 Sep 2004 14:33:21 -0000
@@ -16,21 +16,21 @@
* limitations under the License.
*/
-import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
-import java.util.Hashtable;
-import java.util.Enumeration;
-import java.util.Arrays;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.OutputStream;
-import org.apache.lucene.search.Similarity;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Arrays;
+import java.util.Enumeration;
+import java.util.Hashtable;
final class DocumentWriter {
private Analyzer analyzer;
@@ -125,7 +125,7 @@
if (field.isIndexed()) {
if (!field.isTokenized()) { // un-tokenized field
- addPosition(fieldName, field.stringValue(), position++);
+          addPosition(fieldName, field.stringValue(), position++, new TermVectorOffsetInfo(0, field.stringValue().length()));//untokenized: the whole value is one token, so its offsets span the string (a -1 sentinel would be written as a negative VInt)
length++;
} else {
Reader reader; // find or make Reader
@@ -142,7 +142,7 @@
try {
for (Token t = stream.next(); t != null; t = stream.next()) {
position += (t.getPositionIncrement() - 1);
- addPosition(fieldName, t.termText(), position++);
+ addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(t.startOffset(), t.endOffset()));
if (++length > maxFieldLength) break;
}
} finally {
@@ -159,8 +159,9 @@
private final Term termBuffer = new Term("", ""); // avoid consing
- private final void addPosition(String field, String text, int position) {
+ private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset) {
termBuffer.set(field, text);
Posting ti = (Posting) postingTable.get(termBuffer);
if (ti != null) { // word seen before
int freq = ti.freq;
@@ -172,10 +173,23 @@
ti.positions = newPositions;
}
ti.positions[freq] = position; // add new position
+
+      if (offset != null) {
+        if (ti.offsets.length == freq) {               // offsets array is full
+          TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[freq*2];
+          System.arraycopy(ti.offsets, 0, newOffsets, 0, freq);
+          ti.offsets = newOffsets;                     // grow by doubling, like positions
+        }
+        ti.offsets[freq] = offset;                     // add new offset
+      }
ti.freq = freq + 1; // update frequency
} else { // word not seen before
Term term = new Term(field, text, false);
- postingTable.put(term, new Posting(term, position));
+ postingTable.put(term, new Posting(term, position, offset));
}
}
@@ -294,12 +308,13 @@
termVectorWriter.openDocument();
}
termVectorWriter.openField(currentField);
+
} else if (termVectorWriter != null) {
termVectorWriter.closeField();
}
}
if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
- termVectorWriter.addTerm(posting.term.text(), postingFreq);
+ termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets);
}
}
if (termVectorWriter != null)
@@ -336,11 +351,14 @@
Term term; // the Term
int freq; // its frequency in doc
int[] positions; // positions it occurs at
+ TermVectorOffsetInfo [] offsets;
- Posting(Term t, int position) {
+ Posting(Term t, int position, TermVectorOffsetInfo offset) {
term = t;
freq = 1;
positions = new int[1];
positions[0] = position;
+ offsets = new TermVectorOffsetInfo[1];
+ offsets[0] = offset;
}
}
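
The offsets array in Posting grows exactly the way the positions array always has: doubled when full, with the occupied prefix copied over. A standalone sketch of that amortized-doubling pattern (plain Java; ensureCapacity is a name invented here, not in the patch):

    // Amortized doubling as used for Posting.positions and, with this patch,
    // Posting.offsets: 'used' slots are occupied; make room for one more.
    static int[] ensureCapacity(int[] arr, int used) {
      if (arr.length == used) {
        int[] bigger = new int[used * 2];
        System.arraycopy(arr, 0, bigger, 0, used); // copy the occupied prefix
        return bigger;
      }
      return arr;
    }
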
Index: src/java/org/apache/lucene/index/FieldInfo.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/FieldInfo.java,v
retrieving revision 1.3
diff -u -r1.3 FieldInfo.java
--- src/java/org/apache/lucene/index/FieldInfo.java 29 Mar 2004 22:48:02 -0000 1.3
+++ src/java/org/apache/lucene/index/FieldInfo.java 8 Sep 2004 14:33:21 -0000
@@ -23,11 +23,16 @@
// true if term vector for this field should be stored
boolean storeTermVector;
+ boolean storeOffsetWithTermVector = false;
+ boolean storePositionWithTermVector = false;
- FieldInfo(String na, boolean tk, int nu, boolean storeTermVector) {
+ FieldInfo(String na, boolean tk, int nu, boolean storeTermVector,
+ boolean storePositionWithTermVector, boolean storeOffsetWithTermVector) {
name = na;
isIndexed = tk;
number = nu;
this.storeTermVector = storeTermVector;
+ this.storeOffsetWithTermVector = storeOffsetWithTermVector;
+ this.storePositionWithTermVector = storePositionWithTermVector;
}
}
Index: src/java/org/apache/lucene/index/FieldInfos.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/FieldInfos.java,v
retrieving revision 1.10
diff -u -r1.10 FieldInfos.java
--- src/java/org/apache/lucene/index/FieldInfos.java 25 Aug 2004 12:06:14 -0000 1.10
+++ src/java/org/apache/lucene/index/FieldInfos.java 8 Sep 2004 14:33:21 -0000
@@ -16,15 +16,14 @@
* limitations under the License.
*/
-import java.util.*;
-import java.io.IOException;
-
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
-
import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.OutputStream;
import org.apache.lucene.store.InputStream;
+import org.apache.lucene.store.OutputStream;
+
+import java.io.IOException;
+import java.util.*;
/** Access to the Field Info file that describes document fields and whether or
* not they are indexed. Each segment has a separate Field Info file. Objects
@@ -61,7 +60,8 @@
Enumeration fields = doc.fields();
while (fields.hasMoreElements()) {
Field field = (Field) fields.nextElement();
- add(field.name(), field.isIndexed(), field.isTermVectorStored());
+ add(field.name(), field.isIndexed(), field.isTermVectorStored(), field.isStorePositionWithTermVector(),
+ field.isStoreOffsetWithTermVector());
}
}
@@ -69,10 +69,11 @@
* @param names The names of the fields
* @param storeTermVectors Whether the fields store term vectors or not
*/
- public void addIndexed(Collection names, boolean storeTermVectors) {
+ public void addIndexed(Collection names, boolean storeTermVectors, boolean storePositionWithTermVector,
+ boolean storeOffsetWithTermVector) {
Iterator i = names.iterator();
while (i.hasNext()) {
- add((String)i.next(), true, storeTermVectors);
+ add((String)i.next(), true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector);
}
}
@@ -94,13 +95,15 @@
-   * Calls three parameter add with false for the storeTermVector parameter
+   * Calls the five parameter add with false for all term vector parameters
* @param name The name of the Field
* @param isIndexed true if the field is indexed
- * @see #add(String, boolean, boolean)
+ * @see #add(String, boolean, boolean, boolean, boolean)
*/
public void add(String name, boolean isIndexed) {
- add(name, isIndexed, false);
+ add(name, isIndexed, false, false, false);
}
-
+
+  /** Calls the five parameter add with false for the position and offset parameters
+   * @see #add(String, boolean, boolean, boolean, boolean)
+   */
+  public void add(String name, boolean isIndexed, boolean storeTermVector){
+    add(name, isIndexed, storeTermVector, false, false);
+  }
+
/** If the field is not yet known, adds it. If it is known, checks to make
* sure that the isIndexed flag is the same as was given previously for this
* field. If not - marks it as being indexed. Same goes for storeTermVector
@@ -109,10 +112,11 @@
* @param isIndexed true if the field is indexed
* @param storeTermVector true if the term vector should be stored
*/
- public void add(String name, boolean isIndexed, boolean storeTermVector) {
+ public void add(String name, boolean isIndexed, boolean storeTermVector,
+ boolean storePositionWithTermVector, boolean storeOffsetWithTermVector) {
FieldInfo fi = fieldInfo(name);
if (fi == null) {
- addInternal(name, isIndexed, storeTermVector);
+ addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector);
} else {
if (fi.isIndexed != isIndexed) {
fi.isIndexed = true; // once indexed, always index
@@ -120,13 +124,21 @@
if (fi.storeTermVector != storeTermVector) {
fi.storeTermVector = true; // once vector, always vector
}
+      if (fi.storePositionWithTermVector != storePositionWithTermVector) {
+        fi.storePositionWithTermVector = true;        // once positions, always positions
+      }
+      if (fi.storeOffsetWithTermVector != storeOffsetWithTermVector) {
+        fi.storeOffsetWithTermVector = true;          // once offsets, always offsets
+      }
}
}
private void addInternal(String name, boolean isIndexed,
- boolean storeTermVector) {
+ boolean storeTermVector, boolean storePositionWithTermVector,
+ boolean storeOffsetWithTermVector) {
FieldInfo fi =
- new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector);
+ new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector, storePositionWithTermVector,
+ storeOffsetWithTermVector);
byNumber.add(fi);
byName.put(name, fi);
}
@@ -182,6 +194,8 @@
byte bits = 0x0;
if (fi.isIndexed) bits |= 0x1;
if (fi.storeTermVector) bits |= 0x2;
+ if (fi.storePositionWithTermVector) bits |= 0x4;
+ if (fi.storeOffsetWithTermVector) bits |= 0x8;
output.writeString(fi.name);
//Was REMOVE
//output.writeByte((byte)(fi.isIndexed ? 1 : 0));
@@ -196,7 +210,9 @@
byte bits = input.readByte();
boolean isIndexed = (bits & 0x1) != 0;
boolean storeTermVector = (bits & 0x2) != 0;
- addInternal(name, isIndexed, storeTermVector);
+ boolean storePositionsWithTermVector = (bits & 0x4) != 0;
+ boolean storeOffsetWithTermVector = (bits & 0x8) != 0;
+ addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector);
}
}
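
The .fnm flag byte now packs four booleans into the low bits. A self-contained round-trip sketch using the same masks as the patch (the method names here are illustrative):

    // Encode/decode of the FieldInfos flag byte, masks as in the hunks above.
    static byte encodeFieldBits(boolean indexed, boolean tv, boolean tvPos, boolean tvOff) {
      byte bits = 0x0;
      if (indexed) bits |= 0x1;
      if (tv)      bits |= 0x2;
      if (tvPos)   bits |= 0x4;
      if (tvOff)   bits |= 0x8;
      return bits;
    }

    static boolean[] decodeFieldBits(byte bits) {
      return new boolean[] {
        (bits & 0x1) != 0,  // isIndexed
        (bits & 0x2) != 0,  // storeTermVector
        (bits & 0x4) != 0,  // storePositionWithTermVector
        (bits & 0x8) != 0   // storeOffsetWithTermVector
      };
    }
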
Index: src/java/org/apache/lucene/index/FieldsReader.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/FieldsReader.java,v
retrieving revision 1.8
diff -u -r1.8 FieldsReader.java
--- src/java/org/apache/lucene/index/FieldsReader.java 1 Sep 2004 20:04:12 -0000 1.8
+++ src/java/org/apache/lucene/index/FieldsReader.java 8 Sep 2004 14:33:21 -0000
@@ -16,12 +16,12 @@
* limitations under the License.
*/
-import java.io.IOException;
-
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.InputStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.InputStream;
+
+import java.io.IOException;
/**
* Class responsible for access to stored document fields.
@@ -75,12 +75,27 @@
index = Field.Index.UN_TOKENIZED;
else
index = Field.Index.NO;
+      Field.TermVector termVector = Field.TermVector.NO;
+      if (fi.storeTermVector) {
+        if (fi.storePositionWithTermVector && fi.storeOffsetWithTermVector) {
+          termVector = Field.TermVector.WITH_POSITIONS_OFFSETS;
+        }
+        else if (fi.storePositionWithTermVector) {
+          termVector = Field.TermVector.WITH_POSITIONS;
+        }
+        else if (fi.storeOffsetWithTermVector) {
+          termVector = Field.TermVector.WITH_OFFSETS;
+        }
+        else {
+          termVector = Field.TermVector.YES; //plain term vector: no positions or offsets stored
+        }
+      }
doc.add(new Field(fi.name, // name
fieldsStream.readString(), // read value
Field.Store.YES, index,
- fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO));
+ termVector));
}
-
return doc;
}
}
Index: src/java/org/apache/lucene/index/FilterIndexReader.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/FilterIndexReader.java,v
retrieving revision 1.12
diff -u -r1.12 FilterIndexReader.java
--- src/java/org/apache/lucene/index/FilterIndexReader.java 14 Jun 2004 00:15:24 -0000 1.12
+++ src/java/org/apache/lucene/index/FilterIndexReader.java 8 Sep 2004 14:33:21 -0000
@@ -16,11 +16,11 @@
* limitations under the License.
*/
+import org.apache.lucene.document.Document;
+
import java.io.IOException;
import java.util.Collection;
-import org.apache.lucene.document.Document;
-
/** A FilterIndexReader contains another IndexReader, which it
* uses as its basic source of data, possibly transforming the data along the
* way or providing additional functionality. The class
@@ -145,5 +145,9 @@
*/
public Collection getIndexedFieldNames(boolean storedTermVector) {
return in.getIndexedFieldNames(storedTermVector);
+ }
+
+ public Collection getTermVectorFieldNames(boolean storePositionWithTermVector, boolean storeOffsetWithTermVector) {
+ return in.getTermVectorFieldNames(storePositionWithTermVector, storeOffsetWithTermVector);
}
}
Index: src/java/org/apache/lucene/index/IndexReader.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/IndexReader.java,v
retrieving revision 1.35
diff -u -r1.35 IndexReader.java
--- src/java/org/apache/lucene/index/IndexReader.java 15 Aug 2004 20:49:30 -0000 1.35
+++ src/java/org/apache/lucene/index/IndexReader.java 8 Sep 2004 14:33:21 -0000
@@ -16,16 +16,16 @@
* limitations under the License.
*/
-import java.io.IOException;
-import java.io.File;
-import java.util.Collection;
-
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Lock;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field; // for javadoc
-import org.apache.lucene.search.Similarity;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collection;
/** IndexReader is an abstract class, providing an interface for accessing an
index. Search of an index is done entirely through this abstract interface,
@@ -554,6 +554,26 @@
* @return Collection of Strings indicating the names of the fields
*/
public abstract Collection getIndexedFieldNames(boolean storedTermVector);
+
+  /**
+   * Get all field names that store term vectors whose position and offset flags match the given values
+   * @param storePositionWithTermVector true if the matched fields must store token positions with their term vectors
+   * @param storeOffsetWithTermVector true if the matched fields must store token offsets with their term vectors
+   * @return Collection of Strings indicating the names of the fields
+   */
+ public abstract Collection getTermVectorFieldNames(boolean storePositionWithTermVector,
+ boolean storeOffsetWithTermVector);
+
/**
* Returns true iff the index in the named directory is
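
A caller's-eye sketch of the accessor added by this patch (assumes an existing index whose path is passed on the command line; everything except getTermVectorFieldNames is standard Lucene API):

    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    import java.util.Collection;

    public class ListVectorFields {
      public static void main(String[] args) throws Exception {
        Directory dir = FSDirectory.getDirectory(args[0], false); // open, don't create
        IndexReader reader = IndexReader.open(dir);
        // field names whose term vectors carry both positions and offsets
        Collection withBoth = reader.getTermVectorFieldNames(true, true);
        System.out.println("positions+offsets fields: " + withBoth);
        reader.close();
      }
    }
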
Index: src/java/org/apache/lucene/index/MultiReader.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/MultiReader.java,v
retrieving revision 1.8
diff -u -r1.8 MultiReader.java
--- src/java/org/apache/lucene/index/MultiReader.java 6 Aug 2004 20:50:29 -0000 1.8
+++ src/java/org/apache/lucene/index/MultiReader.java 8 Sep 2004 14:33:21 -0000
@@ -16,16 +16,12 @@
* limitations under the License.
*/
-import java.io.IOException;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.Hashtable;
-import java.util.Iterator;
-import java.util.Set;
-
import org.apache.lucene.document.Document;
import org.apache.lucene.store.Directory;
+import java.io.IOException;
+import java.util.*;
+
/** An IndexReader which reads multiple indexes, appending their content.
*
* @version $Id: MultiReader.java,v 1.8 2004/08/06 20:50:29 dnaber Exp $
@@ -248,6 +244,17 @@
for (int i = 0; i < subReaders.length; i++) {
IndexReader reader = subReaders[i];
Collection names = reader.getIndexedFieldNames(storedTermVector);
+ fieldSet.addAll(names);
+ }
+ return fieldSet;
+ }
+
+ public Collection getTermVectorFieldNames(boolean storePositionWithTermVector, boolean storeOffsetWithTermVector) {
+ // maintain a unique set of field names
+ Set fieldSet = new HashSet();
+ for (int i = 0; i < subReaders.length; i++) {
+ IndexReader reader = subReaders[i];
+ Collection names = reader.getTermVectorFieldNames(storePositionWithTermVector, storeOffsetWithTermVector);
fieldSet.addAll(names);
}
return fieldSet;
Index: src/java/org/apache/lucene/index/SegmentMerger.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/SegmentMerger.java,v
retrieving revision 1.14
diff -u -r1.14 SegmentMerger.java
--- src/java/org/apache/lucene/index/SegmentMerger.java 15 Aug 2004 11:26:05 -0000 1.14
+++ src/java/org/apache/lucene/index/SegmentMerger.java 8 Sep 2004 14:33:22 -0000
@@ -16,14 +16,14 @@
* limitations under the License.
*/
-import java.util.Vector;
-import java.util.Iterator;
-import java.io.IOException;
-
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.OutputStream;
import org.apache.lucene.store.RAMOutputStream;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Vector;
+
/**
* The SegmentMerger class combines two or more Segments, represented by an IndexReader ({@link #add},
* into a single Segment. After adding the appropriate readers, call the merge method to combine the
@@ -157,8 +157,11 @@
int docCount = 0;
for (int i = 0; i < readers.size(); i++) {
IndexReader reader = (IndexReader) readers.elementAt(i);
- fieldInfos.addIndexed(reader.getIndexedFieldNames(true), true);
- fieldInfos.addIndexed(reader.getIndexedFieldNames(false), false);
+ //Can only store position and offset information when storing term vectors
+      fieldInfos.addIndexed(reader.getTermVectorFieldNames(true, true), true, true, true);
+      fieldInfos.addIndexed(reader.getTermVectorFieldNames(true, false), true, true, false);
+      fieldInfos.addIndexed(reader.getTermVectorFieldNames(false, true), true, false, true);
+      fieldInfos.addIndexed(reader.getTermVectorFieldNames(false, false), true, false, false);
+ fieldInfos.addIndexed(reader.getIndexedFieldNames(false), false, false, false);
fieldInfos.add(reader.getFieldNames(false), false);
}
fieldInfos.write(directory, segment + ".fnm");
@@ -211,9 +214,15 @@
termVectorsWriter.openField(termVector.getField());
String [] terms = termVector.getTerms();
int [] freqs = termVector.getTermFrequencies();
+          boolean positionVector = termVector instanceof TermPositionVector;
for (int t = 0; t < terms.length; t++) {
- termVectorsWriter.addTerm(terms[t], freqs[t]);
+            if (!positionVector) {
+ termVectorsWriter.addTerm(terms[t], freqs[t]);
+ } else {
+ termVectorsWriter.addTerm(terms[t], freqs[t], ((TermPositionVector)termVector).getTermPositions(t),
+ ((TermPositionVector)termVector).getOffsets(t));
+ }
}
}
termVectorsWriter.closeDocument();
Index: src/java/org/apache/lucene/index/SegmentReader.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/SegmentReader.java,v
retrieving revision 1.26
diff -u -r1.26 SegmentReader.java
--- src/java/org/apache/lucene/index/SegmentReader.java 17 Aug 2004 08:56:08 -0000 1.26
+++ src/java/org/apache/lucene/index/SegmentReader.java 8 Sep 2004 14:33:22 -0000
@@ -16,20 +16,15 @@
* limitations under the License.
*/
-import java.io.IOException;
-import java.util.Collection;
-import java.util.Enumeration;
-import java.util.HashSet;
-import java.util.Hashtable;
-import java.util.Set;
-import java.util.Vector;
-
import org.apache.lucene.document.Document;
+import org.apache.lucene.store.Directory;
import org.apache.lucene.store.InputStream;
import org.apache.lucene.store.OutputStream;
-import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BitVector;
+import java.io.IOException;
+import java.util.*;
+
/**
* FIXME: Describe class SegmentReader here.
*
@@ -325,6 +320,26 @@
}
return fieldSet;
+ }
+
+  /**
+   * Get all field names that store term vectors whose position and offset flags match the given values
+   *
+   * @param storePositionWithTermVector true if the matched fields must store token positions with their term vectors
+   * @param storeOffsetWithTermVector true if the matched fields must store token offsets with their term vectors
+   * @return Collection of Strings indicating the names of the fields
+   */
+ public Collection getTermVectorFieldNames(boolean storePositionWithTermVector, boolean storeOffsetWithTermVector) {
+ // maintain a unique set of field names
+ Set fieldSet = new HashSet();
+ for (int i = 0; i < fieldInfos.size(); i++) {
+ FieldInfo fi = fieldInfos.fieldInfo(i);
+      if (fi.isIndexed && fi.storeTermVector && fi.storePositionWithTermVector == storePositionWithTermVector
+          && fi.storeOffsetWithTermVector == storeOffsetWithTermVector){
+ fieldSet.add(fi.name);
+ }
+ }
+ return fieldSet;
}
public synchronized byte[] norms(String field) throws IOException {
Index: src/java/org/apache/lucene/index/SegmentTermVector.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/SegmentTermVector.java,v
retrieving revision 1.3
diff -u -r1.3 SegmentTermVector.java
--- src/java/org/apache/lucene/index/SegmentTermVector.java 10 Jul 2004 06:19:01 -0000 1.3
+++ src/java/org/apache/lucene/index/SegmentTermVector.java 8 Sep 2004 14:33:22 -0000
@@ -4,9 +4,9 @@
/**
*/
class SegmentTermVector implements TermFreqVector {
- private String field;
- private String terms[];
- private int termFreqs[];
+ protected String field;
+ protected String terms[];
+ protected int termFreqs[];
SegmentTermVector(String field, String terms[], int termFreqs[]) {
this.field = field;
Index: src/java/org/apache/lucene/index/TermPositionVector.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/TermPositionVector.java,v
retrieving revision 1.1
diff -u -r1.1 TermPositionVector.java
--- src/java/org/apache/lucene/index/TermPositionVector.java 20 Feb 2004 20:14:55 -0000 1.1
+++ src/java/org/apache/lucene/index/TermPositionVector.java 8 Sep 2004 14:33:22 -0000
@@ -4,10 +4,20 @@
* positions in which each of the terms is found.
*/
public interface TermPositionVector extends TermFreqVector {
-
+
/** Returns an array of positions in which the term is found.
-   * Terms are identified by the index at which its number appears in the
-   * term number array obtained from getTermNumbers method.
+   * Terms are identified by the index at which their text appears in the
+   * term String array obtained from the indexOf method.
*/
public int[] getTermPositions(int index);
+
+  /**
+   * Returns an array of TermVectorOffsetInfo objects describing where the term occurs in the source text.
+   *
+   * @see org.apache.lucene.analysis.Token
+   *
+   * @param index The index into the term array of the term whose offsets are requested
+   * @return An array of TermVectorOffsetInfo objects, or an empty array if offsets were not stored
+   */
+ public TermVectorOffsetInfo [] getOffsets(int index);
}
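
Consumers retrieve a vector per document and downcast when the field was indexed with positions and/or offsets; a minimal sketch (document number and field name are placeholders, and 'reader' is an open IndexReader):

    TermFreqVector vector = reader.getTermFreqVector(0, "body");
    if (vector instanceof TermPositionVector) {
      TermPositionVector tpv = (TermPositionVector) vector;
      String[] terms = tpv.getTerms();
      for (int i = 0; i < terms.length; i++) {
        int[] positions = tpv.getTermPositions(i);          // may be null if positions weren't stored
        TermVectorOffsetInfo[] offsets = tpv.getOffsets(i); // likewise may be null or empty
      }
    }
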
Index: src/java/org/apache/lucene/index/TermVectorsReader.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/TermVectorsReader.java,v
retrieving revision 1.3
diff -u -r1.3 TermVectorsReader.java
--- src/java/org/apache/lucene/index/TermVectorsReader.java 17 Aug 2004 20:53:16 -0000 1.3
+++ src/java/org/apache/lucene/index/TermVectorsReader.java 8 Sep 2004 14:33:22 -0000
@@ -210,11 +210,16 @@
if (numTerms == 0) return new SegmentTermVector(field, null, null);
tvf.readVInt();
-
+ byte storePosByte = tvf.readByte();
+ byte storeOffByte = tvf.readByte();
+
String terms[] = new String[numTerms];
int termFreqs[] = new int[numTerms];
-
+    int positions[][] = new int[numTerms][];
+    TermVectorOffsetInfo offsets[][] = new TermVectorOffsetInfo[numTerms][];//may go unused, but declare up front
int start = 0;
int deltaLength = 0;
int totalLength = 0;
@@ -234,8 +239,36 @@
terms[i] = new String(buffer, 0, totalLength);
previousString = terms[i];
termFreqs[i] = tvf.readVInt();
+      //The next byte flags whether position info was stored for this term; 1 means it was.
+      byte storingInfo = tvf.readByte();
+      if (storePosByte == 1 && storingInfo == 1)//can only be 1 when the field-level storePosByte is 1
+ { //read in the positions
+ int numPositions = tvf.readVInt();
+ int [] pos = new int[numPositions];
+ positions[i] = pos;
+ for (int j = 0; j < numPositions; j++)
+ {
+ pos[j] = tvf.readVInt();
+ }
+ }
+ storingInfo = tvf.readByte();
+ if (storeOffByte == 1 && storingInfo == 1)
+ {
+ int numOffsets = tvf.readVInt();
+ TermVectorOffsetInfo[] offs = new TermVectorOffsetInfo[numOffsets];
+ offsets[i] = offs;
+        for (int j = 0; j < numOffsets; j++) {
+          int startOffset = tvf.readVInt();
+          //the writer stores (end - start), so add the start offset back to recover the end
+          offs[j] = new TermVectorOffsetInfo(startOffset, startOffset + tvf.readVInt());
+        }
+ }
+ }
+ SegmentTermVector tv;
+ if (storePosByte == 1 || storeOffByte == 1){
+ tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
+ }
+ else {
+ tv = new SegmentTermVector(field, terms, termFreqs);
}
- SegmentTermVector tv = new SegmentTermVector(field, terms, termFreqs);
return tv;
}
Index: src/java/org/apache/lucene/index/TermVectorsWriter.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java,v
retrieving revision 1.2
diff -u -r1.2 TermVectorsWriter.java
--- src/java/org/apache/lucene/index/TermVectorsWriter.java 17 Aug 2004 20:53:16 -0000 1.2
+++ src/java/org/apache/lucene/index/TermVectorsWriter.java 8 Sep 2004 14:33:22 -0000
@@ -65,16 +65,9 @@
private TVField currentField = null;
private long currentDocPointer = -1;
-
- /** Create term vectors writer for the specified segment in specified
- * directory. A new TermVectorsWriter should be created for each
- * segment. The parameter maxFields indicates how many total
- * fields are found in this document. Not all of these fields may require
- * termvectors to be stored, so the number of calls to
- * openField is less or equal to this number.
- */
- public TermVectorsWriter(Directory directory, String segment,
- FieldInfos fieldInfos)
+
+  /** Create term vectors writer for the specified segment in the specified
+   *  directory.  A new TermVectorsWriter should be created for each segment.
+   */
+  public TermVectorsWriter(Directory directory, String segment, FieldInfos fieldInfos)
throws IOException {
// Open files for TermVector storage
tvx = directory.createFile(segment + TVX_EXTENSION);
@@ -83,12 +76,12 @@
tvd.writeInt(FORMAT_VERSION);
tvf = directory.createFile(segment + TVF_EXTENSION);
tvf.writeInt(FORMAT_VERSION);
-
this.fieldInfos = fieldInfos;
fields = new Vector(fieldInfos.size());
terms = new Vector();
}
+
public final void openDocument()
throws IOException {
@@ -124,7 +117,9 @@
if (!isDocumentOpen()) throw new IllegalStateException("Cannot open field when no document is open.");
closeField();
- currentField = new TVField(fieldInfos.fieldNumber(field));
+ FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+ currentField = new TVField(fieldInfo.number, fieldInfo.storePositionWithTermVector,
+ fieldInfo.storeOffsetWithTermVector);
}
/** Finished processing current field. This should be followed by a call to
@@ -160,14 +155,21 @@
if (!isDocumentOpen()) throw new IllegalStateException("Cannot add terms when document is not open");
if (!isFieldOpen()) throw new IllegalStateException("Cannot add terms when field is not open");
- addTermInternal(termText, freq);
+ addTermInternal(termText, freq, null, null);
+ }
+
+ public final void addTerm(String termText, int freq, int [] positions, TermVectorOffsetInfo [] offsets)
+ {
+ addTermInternal(termText, freq, positions, offsets);
}
- private final void addTermInternal(String termText, int freq) {
+ private final void addTermInternal(String termText, int freq, int [] positions, TermVectorOffsetInfo [] offsets) {
currentField.length += freq;
TVTerm term = new TVTerm();
term.termText = termText;
term.freq = freq;
+ term.positions = positions;
+ term.offsets = offsets;
terms.add(term);
}
@@ -197,16 +199,47 @@
addTermFreqVectorInternal(vector);
}
+ /** Add specified vectors to the document.
+ */
+ public final void addPositionVectors(TermPositionVector[] vectors)
+ throws IOException {
+ if (!isDocumentOpen()) throw new IllegalStateException("Cannot add term vectors when document is not open");
+ if (isFieldOpen()) throw new IllegalStateException("Cannot add term vectors when field is open");
+
+ for (int i = 0; i < vectors.length; i++) {
+ addTermPositionVector(vectors[i]);
+ }
+ }
+
+
+  /** Add the specified vector to the document.  The document must be open, but no
+   *  field may be open, or an exception is thrown.  The same document can mix
+   *  addTerm and addVectors calls; however, a given field must be populated with
+   *  either addTerm or addVector, not both.
+   */
+ public final void addTermPositionVector(TermPositionVector vector)
+ throws IOException {
+ if (!isDocumentOpen()) throw new IllegalStateException("Cannot add term vector when document is not open");
+ if (isFieldOpen()) throw new IllegalStateException("Cannot add term vector when field is open");
+ addTermPositionVectorInternal(vector);
+ }
private final void addTermFreqVectorInternal(TermFreqVector vector)
throws IOException {
openField(vector.getField());
for (int i = 0; i < vector.size(); i++) {
- addTermInternal(vector.getTerms()[i], vector.getTermFrequencies()[i]);
+ addTermInternal(vector.getTerms()[i], vector.getTermFrequencies()[i], null, null);
}
closeField();
}
-
+ private final void addTermPositionVectorInternal(TermPositionVector vector)
+ throws IOException {
+ openField(vector.getField());
+ for (int i = 0; i < vector.size(); i++) {
+ addTermInternal(vector.getTerms()[i], vector.getTermFrequencies()[i], vector.getTermPositions(i), vector.getOffsets(i));
+ }
+ closeField();
+ }
/** Close all streams. */
@@ -249,22 +282,101 @@
tvf.writeVInt(size = terms.size());
tvf.writeVInt(currentField.length - size);
+    boolean storePositions = currentField.storePositions;
+    boolean storeOffsets = currentField.storeOffsets;
+    tvf.writeByte((byte) (storePositions ? 1 : 0));
+    tvf.writeByte((byte) (storeOffsets ? 1 : 0));
String lastTermText = "";
// write term ids and positions
- for (int i = 0; i < size; i++) {
- TVTerm term = (TVTerm) terms.elementAt(i);
- //tvf.writeString(term.termText);
- int start = StringHelper.stringDifference(lastTermText, term.termText);
- int length = term.termText.length() - start;
- tvf.writeVInt(start); // write shared prefix length
- tvf.writeVInt(length); // write delta length
- tvf.writeChars(term.termText, start, length); // write delta chars
- tvf.writeVInt(term.freq);
- lastTermText = term.termText;
+    //Do it this way, so we don't have to check the flags inside the loop
+    if (!storePositions && !storeOffsets)
+    {
+      for (int i = 0; i < size; i++) {
+        TVTerm term = (TVTerm) terms.elementAt(i);
+        writeCoreTermInfo(lastTermText, term);
+        writePositions(null, 0);//store the fact that we aren't storing the info
+        writeOffsets(null, 0);
+        lastTermText = term.termText;
+      }
+    }
+    else if (storePositions && !storeOffsets)
+    {
+      for (int i = 0; i < size; i++) {
+        TVTerm term = (TVTerm) terms.elementAt(i);
+        writeCoreTermInfo(lastTermText, term);
+        writePositions(term.positions, term.freq);
+        writeOffsets(null, 0);//store the fact that we aren't storing offsets
+        lastTermText = term.termText;
+      }
+    }
+    else if (!storePositions && storeOffsets)
+    {
+      for (int i = 0; i < size; i++) {
+        TVTerm term = (TVTerm) terms.elementAt(i);
+        writeCoreTermInfo(lastTermText, term);
+        writePositions(null, 0);
+        writeOffsets(term.offsets, term.freq);
+        lastTermText = term.termText;
+      }
+    }
+    else
+    {
+      for (int i = 0; i < size; i++) {
+        TVTerm term = (TVTerm) terms.elementAt(i);
+        writeCoreTermInfo(lastTermText, term);
+        writePositions(term.positions, term.freq);
+        writeOffsets(term.offsets, term.freq);
+        lastTermText = term.termText;
+      }
+    }
}
}
-
+ private void writeCoreTermInfo(String lastTermText, TVTerm term) throws IOException {
+ int start = StringHelper.stringDifference(lastTermText, term.termText);
+ int length = term.termText.length() - start;
+ tvf.writeVInt(start); // write shared prefix length
+ tvf.writeVInt(length); // write delta length
+ tvf.writeChars(term.termText, start, length); // write delta chars
+ tvf.writeVInt(term.freq);
+ }
+
+ private void writePositions(int [] positions, int freq) throws IOException
+ {
+ if (positions != null && positions.length > 0)
+ {
+ tvf.writeByte((byte)1);
+ tvf.writeVInt(freq);
+ for (int i = 0; i < freq; i++) {
+ tvf.writeVInt(positions[i]);
+ }
+ }
+ else
+ {
+ tvf.writeByte((byte)0);
+ }
+
+ }
+ private void writeOffsets(TermVectorOffsetInfo [] offsets, int freq) throws IOException
+ {
+ if (offsets != null && offsets.length > 0)
+ {
+ tvf.writeByte((byte)1);
+ tvf.writeVInt(freq);
+
+ for (int i = 0; i < freq; i++) {
+ tvf.writeVInt(offsets[i].getStartOffset());
+ tvf.writeVInt(offsets[i].getEndOffset() - offsets[i].getStartOffset()); //Save the diff between the two.
+ }
+ }
+ else
+ {
+ tvf.writeByte((byte)0);
+ }
+ }
private void writeDoc() throws IOException {
@@ -304,16 +416,20 @@
int number;
long tvfPointer = 0;
int length = 0; // number of distinct term positions
-
- TVField(int number) {
+ boolean storePositions = false;
+ boolean storeOffsets = false;
+ TVField(int number, boolean storePos, boolean storeOff) {
this.number = number;
+ storePositions = storePos;
+ storeOffsets = storeOff;
}
}
private static class TVTerm {
String termText;
int freq = 0;
- //int positions[] = null;
+ int positions[] = null;
+ TermVectorOffsetInfo [] offsets = null;
}
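
Read together with the TermVectorsReader hunks above, writeCoreTermInfo/writePositions/writeOffsets give each field in the .tvf file the following record layout (reconstructed from the code in this patch; VInt is Lucene's variable-length integer):

    per field:
      VInt  numTerms
      VInt  totalTermFreq - numTerms
      byte  storePositions flag (1 or 0)
      byte  storeOffsets flag   (1 or 0)
    per term:
      VInt  length of prefix shared with the previous term text
      VInt  suffix length, then that many chars of suffix
      VInt  freq
      byte  1 if positions follow, else 0
        [VInt count, then count x VInt position]
      byte  1 if offsets follow, else 0
        [VInt count, then count x (VInt startOffset, VInt endOffset - startOffset)]
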
cvs server: Diffing src/java/org/apache/lucene/queryParser
cvs server: Diffing src/java/org/apache/lucene/search
cvs server: Diffing src/java/org/apache/lucene/search/spans
cvs server: Diffing src/java/org/apache/lucene/store
cvs server: Diffing src/java/org/apache/lucene/util
cvs server: Diffing src/jsp
cvs server: Diffing src/jsp/WEB-INF
cvs server: Diffing src/test
cvs server: Diffing src/test/org
cvs server: Diffing src/test/org/apache
cvs server: Diffing src/test/org/apache/lucene
cvs server: Diffing src/test/org/apache/lucene/analysis
cvs server: Diffing src/test/org/apache/lucene/analysis/de
cvs server: Diffing src/test/org/apache/lucene/analysis/ru
cvs server: Diffing src/test/org/apache/lucene/document
cvs server: Diffing src/test/org/apache/lucene/index
Index: src/test/org/apache/lucene/index/DocHelper.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/test/org/apache/lucene/index/DocHelper.java,v
retrieving revision 1.1
diff -u -r1.1 DocHelper.java
--- src/test/org/apache/lucene/index/DocHelper.java 20 Feb 2004 20:14:55 -0000 1.1
+++ src/test/org/apache/lucene/index/DocHelper.java 8 Sep 2004 14:33:22 -0000
@@ -1,159 +1,159 @@
-package org.apache.lucene.index;
-
-/**
- * Created by IntelliJ IDEA.
- * User: Grant Ingersoll
- * Date: Feb 2, 2004
- * Time: 6:16:12 PM
- * $Id: DocHelper.java,v 1.1 2004/02/20 20:14:55 cutting Exp $
- * Copyright 2004. Center For Natural Language Processing
- */
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.WhitespaceAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.search.Similarity;
-import org.apache.lucene.store.Directory;
-
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Enumeration;
-
-/**
- *
- *
- **/
-class DocHelper {
- public static final String FIELD_1_TEXT = "field one text";
- public static final String TEXT_FIELD_1_KEY = "textField1";
- public static Field textField1 = Field.Text(TEXT_FIELD_1_KEY, FIELD_1_TEXT, false);
-
- public static final String FIELD_2_TEXT = "field field field two text";
- //Fields will be lexicographically sorted. So, the order is: field, text, two
- public static final int [] FIELD_2_FREQS = {3, 1, 1};
- public static final String TEXT_FIELD_2_KEY = "textField2";
- public static Field textField2 = Field.Text(TEXT_FIELD_2_KEY, FIELD_2_TEXT, true);
-
- public static final String KEYWORD_TEXT = "Keyword";
- public static final String KEYWORD_FIELD_KEY = "keyField";
- public static Field keyField = Field.Keyword(KEYWORD_FIELD_KEY, KEYWORD_TEXT);
-
- public static final String UNINDEXED_FIELD_TEXT = "unindexed field text";
- public static final String UNINDEXED_FIELD_KEY = "unIndField";
- public static Field unIndField = Field.UnIndexed(UNINDEXED_FIELD_KEY, UNINDEXED_FIELD_TEXT);
-
- public static final String UNSTORED_1_FIELD_TEXT = "unstored field text";
- public static final String UNSTORED_FIELD_1_KEY = "unStoredField1";
- public static Field unStoredField1 = Field.UnStored(UNSTORED_FIELD_1_KEY, UNSTORED_1_FIELD_TEXT, false);
-
- public static final String UNSTORED_2_FIELD_TEXT = "unstored field text";
- public static final String UNSTORED_FIELD_2_KEY = "unStoredField2";
- public static Field unStoredField2 = Field.UnStored(UNSTORED_FIELD_2_KEY, UNSTORED_2_FIELD_TEXT, true);
-
-// public static Set fieldNamesSet = null;
-// public static Set fieldValuesSet = null;
- public static Map nameValues = null;
-
- static
- {
-
- nameValues = new HashMap();
- nameValues.put(TEXT_FIELD_1_KEY, FIELD_1_TEXT);
- nameValues.put(TEXT_FIELD_2_KEY, FIELD_2_TEXT);
- nameValues.put(KEYWORD_FIELD_KEY, KEYWORD_TEXT);
- nameValues.put(UNINDEXED_FIELD_KEY, UNINDEXED_FIELD_TEXT);
- nameValues.put(UNSTORED_FIELD_1_KEY, UNSTORED_1_FIELD_TEXT);
- nameValues.put(UNSTORED_FIELD_2_KEY, UNSTORED_2_FIELD_TEXT);
- }
-
- /**
- * Adds the fields above to a document
- * @param doc The document to write
- */
- public static void setupDoc(Document doc) {
- doc.add(textField1);
- doc.add(textField2);
- doc.add(keyField);
- doc.add(unIndField);
- doc.add(unStoredField1);
- doc.add(unStoredField2);
- }
- /**
- * Writes the document to the directory using a segment named "test"
- * @param dir
- * @param doc
- */
- public static void writeDoc(Directory dir, Document doc)
- {
-
- writeDoc(dir, "test", doc);
- }
- /**
- * Writes the document to the directory in the given segment
- * @param dir
- * @param segment
- * @param doc
- */
- public static void writeDoc(Directory dir, String segment, Document doc)
- {
- Analyzer analyzer = new WhitespaceAnalyzer();
- Similarity similarity = Similarity.getDefault();
- writeDoc(dir, analyzer, similarity, segment, doc);
- }
- /**
- * Writes the document to the directory segment named "test" using the specified analyzer and similarity
- * @param dir
- * @param analyzer
- * @param similarity
- * @param doc
- */
- public static void writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, Document doc)
- {
- writeDoc(dir, analyzer, similarity, "test", doc);
- }
- /**
- * Writes the document to the directory segment using the analyzer and the similarity score
- * @param dir
- * @param analyzer
- * @param similarity
- * @param segment
- * @param doc
- */
- public static void writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, String segment, Document doc)
- {
- DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
- try {
- writer.addDocument(segment, doc);
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-
- public static int numFields(Document doc) {
- Enumeration fields = doc.fields();
- int result = 0;
- while (fields.hasMoreElements()) {
- fields.nextElement();
- result++;
- }
- return result;
- }
-}
-/*
- fieldNamesSet = new HashSet();
- fieldNamesSet.add(TEXT_FIELD_1_KEY);
- fieldNamesSet.add(TEXT_FIELD_2_KEY);
- fieldNamesSet.add(KEYWORD_FIELD_KEY);
- fieldNamesSet.add(UNINDEXED_FIELD_KEY);
- fieldNamesSet.add(UNSTORED_FIELD_1_KEY);
- fieldNamesSet.add(UNSTORED_FIELD_2_KEY);
- fieldValuesSet = new HashSet();
- fieldValuesSet.add(FIELD_1_TEXT);
- fieldValuesSet.add(FIELD_2_TEXT);
- fieldValuesSet.add(KEYWORD_TEXT);
- fieldValuesSet.add(UNINDEXED_FIELD_TEXT);
- fieldValuesSet.add(UNSTORED_1_FIELD_TEXT);
- fieldValuesSet.add(UNSTORED_2_FIELD_TEXT);
-*/
+package org.apache.lucene.index;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: Grant Ingersoll
+ * Date: Feb 2, 2004
+ * Time: 6:16:12 PM
+ * $Id: DocHelper.java,v 1.1 2004/02/20 20:14:55 cutting Exp $
+ * Copyright 2004. Center For Natural Language Processing
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.store.Directory;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Enumeration;
+
+/**
+ *
+ *
+ **/
+class DocHelper {
+ public static final String FIELD_1_TEXT = "field one text";
+ public static final String TEXT_FIELD_1_KEY = "textField1";
+ public static Field textField1 = Field.Text(TEXT_FIELD_1_KEY, FIELD_1_TEXT, false);
+
+ public static final String FIELD_2_TEXT = "field field field two text";
+ //Fields will be lexicographically sorted. So, the order is: field, text, two
+ public static final int [] FIELD_2_FREQS = {3, 1, 1};
+ public static final String TEXT_FIELD_2_KEY = "textField2";
+ public static Field textField2 = new Field(TEXT_FIELD_2_KEY, FIELD_2_TEXT, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+
+ public static final String KEYWORD_TEXT = "Keyword";
+ public static final String KEYWORD_FIELD_KEY = "keyField";
+ public static Field keyField = Field.Keyword(KEYWORD_FIELD_KEY, KEYWORD_TEXT);
+
+ public static final String UNINDEXED_FIELD_TEXT = "unindexed field text";
+ public static final String UNINDEXED_FIELD_KEY = "unIndField";
+ public static Field unIndField = Field.UnIndexed(UNINDEXED_FIELD_KEY, UNINDEXED_FIELD_TEXT);
+
+ public static final String UNSTORED_1_FIELD_TEXT = "unstored field text";
+ public static final String UNSTORED_FIELD_1_KEY = "unStoredField1";
+ public static Field unStoredField1 = Field.UnStored(UNSTORED_FIELD_1_KEY, UNSTORED_1_FIELD_TEXT, false);
+
+ public static final String UNSTORED_2_FIELD_TEXT = "unstored field text";
+ public static final String UNSTORED_FIELD_2_KEY = "unStoredField2";
+ public static Field unStoredField2 = Field.UnStored(UNSTORED_FIELD_2_KEY, UNSTORED_2_FIELD_TEXT, true);
+
+// public static Set fieldNamesSet = null;
+// public static Set fieldValuesSet = null;
+ public static Map nameValues = null;
+
+ static
+ {
+
+ nameValues = new HashMap();
+ nameValues.put(TEXT_FIELD_1_KEY, FIELD_1_TEXT);
+ nameValues.put(TEXT_FIELD_2_KEY, FIELD_2_TEXT);
+ nameValues.put(KEYWORD_FIELD_KEY, KEYWORD_TEXT);
+ nameValues.put(UNINDEXED_FIELD_KEY, UNINDEXED_FIELD_TEXT);
+ nameValues.put(UNSTORED_FIELD_1_KEY, UNSTORED_1_FIELD_TEXT);
+ nameValues.put(UNSTORED_FIELD_2_KEY, UNSTORED_2_FIELD_TEXT);
+ }
+
+ /**
+ * Adds the fields above to a document
+ * @param doc The document to write
+ */
+ public static void setupDoc(Document doc) {
+ doc.add(textField1);
+ doc.add(textField2);
+ doc.add(keyField);
+ doc.add(unIndField);
+ doc.add(unStoredField1);
+ doc.add(unStoredField2);
+ }
+ /**
+ * Writes the document to the directory using a segment named "test"
+ * @param dir
+ * @param doc
+ */
+ public static void writeDoc(Directory dir, Document doc)
+ {
+
+ writeDoc(dir, "test", doc);
+ }
+ /**
+ * Writes the document to the directory in the given segment
+ * @param dir
+ * @param segment
+ * @param doc
+ */
+ public static void writeDoc(Directory dir, String segment, Document doc)
+ {
+ Analyzer analyzer = new WhitespaceAnalyzer();
+ Similarity similarity = Similarity.getDefault();
+ writeDoc(dir, analyzer, similarity, segment, doc);
+ }
+ /**
+ * Writes the document to the directory segment named "test" using the specified analyzer and similarity
+ * @param dir
+ * @param analyzer
+ * @param similarity
+ * @param doc
+ */
+ public static void writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, Document doc)
+ {
+ writeDoc(dir, analyzer, similarity, "test", doc);
+ }
+ /**
+ * Writes the document to the directory segment using the analyzer and the similarity score
+ * @param dir
+ * @param analyzer
+ * @param similarity
+ * @param segment
+ * @param doc
+ */
+ public static void writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, String segment, Document doc)
+ {
+ DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
+ try {
+ writer.addDocument(segment, doc);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ public static int numFields(Document doc) {
+ Enumeration fields = doc.fields();
+ int result = 0;
+ while (fields.hasMoreElements()) {
+      fields.nextElement();
+ result++;
+ }
+ return result;
+ }
+}
+/*
+ fieldNamesSet = new HashSet();
+ fieldNamesSet.add(TEXT_FIELD_1_KEY);
+ fieldNamesSet.add(TEXT_FIELD_2_KEY);
+ fieldNamesSet.add(KEYWORD_FIELD_KEY);
+ fieldNamesSet.add(UNINDEXED_FIELD_KEY);
+ fieldNamesSet.add(UNSTORED_FIELD_1_KEY);
+ fieldNamesSet.add(UNSTORED_FIELD_2_KEY);
+ fieldValuesSet = new HashSet();
+ fieldValuesSet.add(FIELD_1_TEXT);
+ fieldValuesSet.add(FIELD_2_TEXT);
+ fieldValuesSet.add(KEYWORD_TEXT);
+ fieldValuesSet.add(UNINDEXED_FIELD_TEXT);
+ fieldValuesSet.add(UNSTORED_1_FIELD_TEXT);
+ fieldValuesSet.add(UNSTORED_2_FIELD_TEXT);
+*/
Index: src/test/org/apache/lucene/index/TestDocumentWriter.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/test/org/apache/lucene/index/TestDocumentWriter.java,v
retrieving revision 1.2
diff -u -r1.2 TestDocumentWriter.java
--- src/test/org/apache/lucene/index/TestDocumentWriter.java 29 Mar 2004 22:48:06 -0000 1.2
+++ src/test/org/apache/lucene/index/TestDocumentWriter.java 8 Sep 2004 14:33:22 -0000
@@ -1,83 +1,83 @@
-package org.apache.lucene.index;
-
-/**
- * Copyright 2004 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import junit.framework.TestCase;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.WhitespaceAnalyzer;
-import org.apache.lucene.search.Similarity;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-
-import java.io.IOException;
-
-public class TestDocumentWriter extends TestCase {
- private RAMDirectory dir = new RAMDirectory();
- private Document testDoc = new Document();
-
-
- public TestDocumentWriter(String s) {
- super(s);
- }
-
- protected void setUp() {
- DocHelper.setupDoc(testDoc);
- }
-
- protected void tearDown() {
-
- }
-
- public void test() {
- assertTrue(dir != null);
-
- }
-
- public void testAddDocument() {
- Analyzer analyzer = new WhitespaceAnalyzer();
- Similarity similarity = Similarity.getDefault();
- DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
- assertTrue(writer != null);
- try {
- writer.addDocument("test", testDoc);
- //After adding the document, we should be able to read it back in
- SegmentReader reader = new SegmentReader(new SegmentInfo("test", 1, dir));
- assertTrue(reader != null);
- Document doc = reader.document(0);
- assertTrue(doc != null);
-
- //System.out.println("Document: " + doc);
- Field [] fields = doc.getFields("textField2");
- assertTrue(fields != null && fields.length == 1);
- assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_2_TEXT));
- assertTrue(fields[0].isTermVectorStored() == true);
-
- fields = doc.getFields("textField1");
- assertTrue(fields != null && fields.length == 1);
- assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_1_TEXT));
- assertTrue(fields[0].isTermVectorStored() == false);
-
- fields = doc.getFields("keyField");
- assertTrue(fields != null && fields.length == 1);
- assertTrue(fields[0].stringValue().equals(DocHelper.KEYWORD_TEXT));
- } catch (IOException e) {
- e.printStackTrace();
- assertTrue(false);
- }
- }
-}
+package org.apache.lucene.index;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+import java.io.IOException;
+
+public class TestDocumentWriter extends TestCase {
+ private RAMDirectory dir = new RAMDirectory();
+ private Document testDoc = new Document();
+
+
+ public TestDocumentWriter(String s) {
+ super(s);
+ }
+
+ protected void setUp() {
+ DocHelper.setupDoc(testDoc);
+ }
+
+ protected void tearDown() {
+
+ }
+
+ public void test() {
+ assertTrue(dir != null);
+
+ }
+
+ public void testAddDocument() {
+ Analyzer analyzer = new WhitespaceAnalyzer();
+ Similarity similarity = Similarity.getDefault();
+ DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
+ assertTrue(writer != null);
+ try {
+ writer.addDocument("test", testDoc);
+ //After adding the document, we should be able to read it back in
+ SegmentReader reader = new SegmentReader(new SegmentInfo("test", 1, dir));
+ assertTrue(reader != null);
+ Document doc = reader.document(0);
+ assertTrue(doc != null);
+
+ //System.out.println("Document: " + doc);
+ Field [] fields = doc.getFields("textField2");
+ assertTrue(fields != null && fields.length == 1);
+ assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_2_TEXT));
+ assertTrue(fields[0].isTermVectorStored() == true);
+
+ fields = doc.getFields("textField1");
+ assertTrue(fields != null && fields.length == 1);
+ assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_1_TEXT));
+ assertTrue(fields[0].isTermVectorStored() == false);
+
+ fields = doc.getFields("keyField");
+ assertTrue(fields != null && fields.length == 1);
+ assertTrue(fields[0].stringValue().equals(DocHelper.KEYWORD_TEXT));
+ } catch (IOException e) {
+ e.printStackTrace();
+ assertTrue(false);
+ }
+ }
+}
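
As a usage sketch (not from the patch itself): the term-vector behavior asserted above is decided when each field is constructed. Field names and values here are illustrative stand-ins for what DocHelper sets up.

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;

    public class FieldTermVectorSketch {
      public static void main(String[] args) {
        Document doc = new Document();
        //Tokenized, stored field whose term vector records both token positions
        //and character offsets; isTermVectorStored() is true when read back.
        doc.add(new Field("textField2", "field field field one two three",
            Field.Store.YES, Field.Index.TOKENIZED,
            Field.TermVector.WITH_POSITIONS_OFFSETS));
        //Same storage and indexing, but no term vector at all (assumes the
        //pre-existing Field.TermVector.NO constant).
        doc.add(new Field("textField1", "some stored text",
            Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO));
        System.out.println(doc);
      }
    }
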
Index: src/test/org/apache/lucene/index/TestSegmentMerger.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/test/org/apache/lucene/index/TestSegmentMerger.java,v
retrieving revision 1.4
diff -u -r1.4 TestSegmentMerger.java
--- src/test/org/apache/lucene/index/TestSegmentMerger.java 8 Aug 2004 13:05:33 -0000 1.4
+++ src/test/org/apache/lucene/index/TestSegmentMerger.java 8 Sep 2004 14:33:22 -0000
@@ -109,6 +109,7 @@
int [] freqs = vector.getTermFrequencies();
assertTrue(freqs != null);
//System.out.println("Freqs size: " + freqs.length);
+      assertTrue(vector instanceof TermPositionVector);
for (int i = 0; i < terms.length; i++) {
String term = terms[i];
Index: src/test/org/apache/lucene/index/TestSegmentReader.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/test/org/apache/lucene/index/TestSegmentReader.java,v
retrieving revision 1.3
diff -u -r1.3 TestSegmentReader.java
--- src/test/org/apache/lucene/index/TestSegmentReader.java 6 Aug 2004 21:32:51 -0000 1.3
+++ src/test/org/apache/lucene/index/TestSegmentReader.java 8 Sep 2004 14:33:22 -0000
@@ -1,199 +1,199 @@
-package org.apache.lucene.index;
-
-/**
- * Copyright 2004 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import junit.framework.TestCase;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-
-import java.io.IOException;
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.Enumeration;
-
-public class TestSegmentReader extends TestCase {
- private RAMDirectory dir = new RAMDirectory();
- private Document testDoc = new Document();
- private SegmentReader reader = null;
-
- public TestSegmentReader(String s) {
- super(s);
- }
-
- //TODO: Setup the reader w/ multiple documents
- protected void setUp() {
-
- try {
- DocHelper.setupDoc(testDoc);
- DocHelper.writeDoc(dir, testDoc);
- reader = new SegmentReader(new SegmentInfo("test", 1, dir));
- } catch (IOException e) {
-
- }
- }
-
- protected void tearDown() {
-
- }
-
- public void test() {
- assertTrue(dir != null);
- assertTrue(reader != null);
- assertTrue(DocHelper.nameValues.size() > 0);
- assertTrue(DocHelper.numFields(testDoc) == 6);
- }
-
- public void testDocument() {
- try {
- assertTrue(reader.numDocs() == 1);
- assertTrue(reader.maxDoc() >= 1);
- Document result = reader.document(0);
- assertTrue(result != null);
- //There are 2 unstored fields on the document that are not preserved across writing
- assertTrue(DocHelper.numFields(result) == DocHelper.numFields(testDoc) - 2);
-
- Enumeration fields = result.fields();
- while (fields.hasMoreElements()) {
- Field field = (Field) fields.nextElement();
- assertTrue(field != null);
- assertTrue(DocHelper.nameValues.containsKey(field.name()));
- }
- } catch (IOException e) {
- e.printStackTrace();
- assertTrue(false);
- }
- }
-
- public void testDelete() {
- Document docToDelete = new Document();
- DocHelper.setupDoc(docToDelete);
- DocHelper.writeDoc(dir, "seg-to-delete", docToDelete);
- try {
- SegmentReader deleteReader = new SegmentReader(new SegmentInfo("seg-to-delete", 1, dir));
- assertTrue(deleteReader != null);
- assertTrue(deleteReader.numDocs() == 1);
- deleteReader.delete(0);
- assertTrue(deleteReader.isDeleted(0) == true);
- assertTrue(deleteReader.hasDeletions() == true);
- assertTrue(deleteReader.numDocs() == 0);
- try {
- Document test = deleteReader.document(0);
- assertTrue(false);
- } catch (IllegalArgumentException e) {
- assertTrue(true);
- }
- } catch (IOException e) {
- e.printStackTrace();
- assertTrue(false);
- }
- }
-
- public void testGetFieldNameVariations() {
- Collection result = reader.getFieldNames();
- assertTrue(result != null);
- assertTrue(result.size() == 7);
- for (Iterator iter = result.iterator(); iter.hasNext();) {
- String s = (String) iter.next();
- //System.out.println("Name: " + s);
- assertTrue(DocHelper.nameValues.containsKey(s) == true || s.equals(""));
- }
- result = reader.getFieldNames(true);
- assertTrue(result != null);
- assertTrue(result.size() == 5);
- for (Iterator iter = result.iterator(); iter.hasNext();) {
- String s = (String) iter.next();
- assertTrue(DocHelper.nameValues.containsKey(s) == true || s.equals(""));
- }
-
- result = reader.getFieldNames(false);
- assertTrue(result != null);
- assertTrue(result.size() == 2);
- //Get all indexed fields that are storing term vectors
- result = reader.getIndexedFieldNames(true);
- assertTrue(result != null);
- assertTrue(result.size() == 2);
-
- result = reader.getIndexedFieldNames(false);
- assertTrue(result != null);
- assertTrue(result.size() == 3);
- }
-
- public void testTerms() {
- try {
- TermEnum terms = reader.terms();
- assertTrue(terms != null);
- while (terms.next() == true)
- {
- Term term = terms.term();
- assertTrue(term != null);
- //System.out.println("Term: " + term);
- String fieldValue = (String)DocHelper.nameValues.get(term.field());
- assertTrue(fieldValue.indexOf(term.text()) != -1);
- }
-
- TermDocs termDocs = reader.termDocs();
- assertTrue(termDocs != null);
- termDocs.seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "field"));
- assertTrue(termDocs.next() == true);
-
- TermPositions positions = reader.termPositions();
- positions.seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "field"));
- assertTrue(positions != null);
- assertTrue(positions.doc() == 0);
- assertTrue(positions.nextPosition() >= 0);
-
- } catch (IOException e) {
- e.printStackTrace();
- assertTrue(false);
- }
- }
-
- public void testNorms() {
- //TODO: Not sure how these work/should be tested
-/*
- try {
- byte [] norms = reader.norms(DocHelper.TEXT_FIELD_1_KEY);
- System.out.println("Norms: " + norms);
- assertTrue(norms != null);
- } catch (IOException e) {
- e.printStackTrace();
- assertTrue(false);
- }
-*/
-
- }
-
- public void testTermVectors() {
- TermFreqVector result = reader.getTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY);
- assertTrue(result != null);
- String [] terms = result.getTerms();
- int [] freqs = result.getTermFrequencies();
- assertTrue(terms != null && terms.length == 3 && freqs != null && freqs.length == 3);
- for (int i = 0; i < terms.length; i++) {
- String term = terms[i];
- int freq = freqs[i];
- assertTrue(DocHelper.FIELD_2_TEXT.indexOf(term) != -1);
- assertTrue(freq > 0);
- }
-
- TermFreqVector [] results = reader.getTermFreqVectors(0);
- assertTrue(results != null);
- assertTrue(results.length == 2);
- }
-
-}
+package org.apache.lucene.index;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.Enumeration;
+
+public class TestSegmentReader extends TestCase {
+ private RAMDirectory dir = new RAMDirectory();
+ private Document testDoc = new Document();
+ private SegmentReader reader = null;
+
+ public TestSegmentReader(String s) {
+ super(s);
+ }
+
+ //TODO: Setup the reader w/ multiple documents
+ protected void setUp() {
+
+ try {
+ DocHelper.setupDoc(testDoc);
+ DocHelper.writeDoc(dir, testDoc);
+ reader = new SegmentReader(new SegmentInfo("test", 1, dir));
+ } catch (IOException e) {
+      e.printStackTrace();
+ }
+ }
+
+ protected void tearDown() {
+
+ }
+
+ public void test() {
+ assertTrue(dir != null);
+ assertTrue(reader != null);
+ assertTrue(DocHelper.nameValues.size() > 0);
+ assertTrue(DocHelper.numFields(testDoc) == 6);
+ }
+
+ public void testDocument() {
+ try {
+ assertTrue(reader.numDocs() == 1);
+ assertTrue(reader.maxDoc() >= 1);
+ Document result = reader.document(0);
+ assertTrue(result != null);
+ //There are 2 unstored fields on the document that are not preserved across writing
+ assertTrue(DocHelper.numFields(result) == DocHelper.numFields(testDoc) - 2);
+
+ Enumeration fields = result.fields();
+ while (fields.hasMoreElements()) {
+ Field field = (Field) fields.nextElement();
+ assertTrue(field != null);
+ assertTrue(DocHelper.nameValues.containsKey(field.name()));
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ assertTrue(false);
+ }
+ }
+
+ public void testDelete() {
+ Document docToDelete = new Document();
+ DocHelper.setupDoc(docToDelete);
+ DocHelper.writeDoc(dir, "seg-to-delete", docToDelete);
+ try {
+ SegmentReader deleteReader = new SegmentReader(new SegmentInfo("seg-to-delete", 1, dir));
+ assertTrue(deleteReader != null);
+ assertTrue(deleteReader.numDocs() == 1);
+ deleteReader.delete(0);
+ assertTrue(deleteReader.isDeleted(0) == true);
+ assertTrue(deleteReader.hasDeletions() == true);
+ assertTrue(deleteReader.numDocs() == 0);
+ try {
+ Document test = deleteReader.document(0);
+ assertTrue(false);
+ } catch (IllegalArgumentException e) {
+ assertTrue(true);
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ assertTrue(false);
+ }
+ }
+
+ public void testGetFieldNameVariations() {
+ Collection result = reader.getFieldNames();
+ assertTrue(result != null);
+ assertTrue(result.size() == 7);
+ for (Iterator iter = result.iterator(); iter.hasNext();) {
+ String s = (String) iter.next();
+ //System.out.println("Name: " + s);
+ assertTrue(DocHelper.nameValues.containsKey(s) == true || s.equals(""));
+ }
+ result = reader.getFieldNames(true);
+ assertTrue(result != null);
+ assertTrue(result.size() == 5);
+ for (Iterator iter = result.iterator(); iter.hasNext();) {
+ String s = (String) iter.next();
+ assertTrue(DocHelper.nameValues.containsKey(s) == true || s.equals(""));
+ }
+
+ result = reader.getFieldNames(false);
+ assertTrue(result != null);
+ assertTrue(result.size() == 2);
+ //Get all indexed fields that are storing term vectors
+ result = reader.getIndexedFieldNames(true);
+ assertTrue(result != null);
+ assertTrue(result.size() == 2);
+
+ result = reader.getIndexedFieldNames(false);
+ assertTrue(result != null);
+ assertTrue(result.size() == 3);
+ }
+
+ public void testTerms() {
+ try {
+ TermEnum terms = reader.terms();
+ assertTrue(terms != null);
+ while (terms.next() == true)
+ {
+ Term term = terms.term();
+ assertTrue(term != null);
+ //System.out.println("Term: " + term);
+ String fieldValue = (String)DocHelper.nameValues.get(term.field());
+ assertTrue(fieldValue.indexOf(term.text()) != -1);
+ }
+
+ TermDocs termDocs = reader.termDocs();
+ assertTrue(termDocs != null);
+ termDocs.seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "field"));
+ assertTrue(termDocs.next() == true);
+
+      TermPositions positions = reader.termPositions();
+      assertTrue(positions != null);
+      positions.seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "field"));
+ assertTrue(positions.doc() == 0);
+ assertTrue(positions.nextPosition() >= 0);
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ assertTrue(false);
+ }
+ }
+
+ public void testNorms() {
+ //TODO: Not sure how these work/should be tested
+/*
+ try {
+ byte [] norms = reader.norms(DocHelper.TEXT_FIELD_1_KEY);
+ System.out.println("Norms: " + norms);
+ assertTrue(norms != null);
+ } catch (IOException e) {
+ e.printStackTrace();
+ assertTrue(false);
+ }
+*/
+
+ }
+
+ public void testTermVectors() {
+ TermFreqVector result = reader.getTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY);
+ assertTrue(result != null);
+ String [] terms = result.getTerms();
+ int [] freqs = result.getTermFrequencies();
+ assertTrue(terms != null && terms.length == 3 && freqs != null && freqs.length == 3);
+ for (int i = 0; i < terms.length; i++) {
+ String term = terms[i];
+ int freq = freqs[i];
+ assertTrue(DocHelper.FIELD_2_TEXT.indexOf(term) != -1);
+ assertTrue(freq > 0);
+ }
+
+ TermFreqVector [] results = reader.getTermFreqVectors(0);
+ assertTrue(results != null);
+ assertTrue(results.length == 2);
+ }
+
+}
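
The read path exercised by testTermVectors() above reduces to a small, self-contained sketch (1.4-era API as used elsewhere in this diff; the field name and text are illustrative):

    import org.apache.lucene.analysis.SimpleAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.TermFreqVector;
    import org.apache.lucene.store.RAMDirectory;

    public class ReadTermVectorSketch {
      public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
        Document doc = new Document();
        doc.add(new Field("field", "a test of term vectors",
            Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
        writer.addDocument(doc);
        writer.close();

        IndexReader reader = IndexReader.open(dir);
        //One vector per document/field pair that was indexed with term vectors
        TermFreqVector vector = reader.getTermFreqVector(0, "field");
        String[] terms = vector.getTerms();
        int[] freqs = vector.getTermFrequencies();
        for (int i = 0; i < terms.length; i++) {
          System.out.println(terms[i] + " x " + freqs[i]);
        }
        reader.close();
      }
    }
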
Index: src/test/org/apache/lucene/index/TestTermVectorsReader.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/test/org/apache/lucene/index/TestTermVectorsReader.java,v
retrieving revision 1.1
diff -u -r1.1 TestTermVectorsReader.java
--- src/test/org/apache/lucene/index/TestTermVectorsReader.java 20 Feb 2004 20:14:55 -0000 1.1
+++ src/test/org/apache/lucene/index/TestTermVectorsReader.java 8 Sep 2004 14:33:23 -0000
@@ -1,106 +1,218 @@
-package org.apache.lucene.index;
-
-
-import junit.framework.TestCase;
-import org.apache.lucene.store.RAMDirectory;
-
-import java.io.IOException;
-import java.util.Arrays;
-
-public class TestTermVectorsReader extends TestCase {
- private TermVectorsWriter writer = null;
- //Must be lexicographically sorted, will do in setup, versus trying to maintain here
- private String [] testFields = {"f1", "f2", "f3"};
- private String [] testTerms = {"this", "is", "a", "test"};
- private RAMDirectory dir = new RAMDirectory();
- private String seg = "testSegment";
- private FieldInfos fieldInfos = new FieldInfos();
-
- public TestTermVectorsReader(String s) {
- super(s);
- }
-
- protected void setUp() {
- for (int i = 0; i < testFields.length; i++) {
- fieldInfos.add(testFields[i], true, true);
- }
-
- try {
- Arrays.sort(testTerms);
- for (int j = 0; j < 5; j++) {
- writer = new TermVectorsWriter(dir, seg, fieldInfos);
- writer.openDocument();
-
- for (int k = 0; k < testFields.length; k++) {
- writer.openField(testFields[k]);
- for (int i = 0; i < testTerms.length; i++) {
- writer.addTerm(testTerms[i], i);
- }
- writer.closeField();
- }
- writer.closeDocument();
- writer.close();
- }
-
- } catch (IOException e) {
- e.printStackTrace();
- assertTrue(false);
- }
- }
-
- protected void tearDown() {
-
- }
-
- public void test() {
- //Check to see the files were created properly in setup
- assertTrue(writer.isDocumentOpen() == false);
- assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION));
- assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION));
- }
-
- public void testReader() {
- try {
- TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
- assertTrue(reader != null);
- TermFreqVector vector = reader.get(0, testFields[0]);
- assertTrue(vector != null);
- String [] terms = vector.getTerms();
- assertTrue(terms != null);
- assertTrue(terms.length == testTerms.length);
- for (int i = 0; i < terms.length; i++) {
- String term = terms[i];
- //System.out.println("Term: " + term);
- assertTrue(term.equals(testTerms[i]));
- }
-
- } catch (IOException e) {
- e.printStackTrace();
- assertTrue(false);
- }
- }
-
- /**
- * Make sure exceptions and bad params are handled appropriately
- */
- public void testBadParams() {
- try {
- TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
- assertTrue(reader != null);
- //Bad document number, good field number
- TermFreqVector vector = reader.get(50, testFields[0]);
- assertTrue(vector == null);
- } catch (Exception e) {
- assertTrue(false);
- }
- try {
- TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
- assertTrue(reader != null);
- //good document number, bad field number
- TermFreqVector vector = reader.get(0, "f50");
- assertTrue(vector == null);
- } catch (Exception e) {
- assertTrue(false);
- }
- }
-}
+package org.apache.lucene.index;
+
+
+import junit.framework.TestCase;
+import org.apache.lucene.store.RAMDirectory;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+public class TestTermVectorsReader extends TestCase {
+ private TermVectorsWriter writer = null;
+  //testTerms must be lexicographically sorted; setUp() sorts them rather than maintaining the order here
+ private String [] testFields = {"f1", "f2", "f3"};
+  private boolean [] testFieldsStorePos = {true, false, true}; //one flag per field in testFields
+  private boolean [] testFieldsStoreOff = {true, false, false}; //f1 stores both, f2 neither, f3 positions only
+ private String [] testTerms = {"this", "is", "a", "test"};
+ private int [][] positions = new int[testTerms.length][];
+ private TermVectorOffsetInfo [][] offsets = new TermVectorOffsetInfo[testTerms.length][];
+ private RAMDirectory dir = new RAMDirectory();
+ private String seg = "testSegment";
+ private FieldInfos fieldInfos = new FieldInfos();
+
+ public TestTermVectorsReader(String s) {
+ super(s);
+ }
+
+  protected void setUp() {
+    Arrays.sort(testTerms); //sort first so the positions/offsets built below line up with the sorted terms
+    for (int i = 0; i < testFields.length; i++) {
+      fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]);
+    }
+    for (int i = 0; i < testTerms.length; i++)
+ {
+ positions[i] = new int[3];
+ for (int j = 0; j < positions[i].length; j++) {
+ positions[i][j] = (int)(Math.random() * 1000);
+ }
+ offsets[i] = new TermVectorOffsetInfo[3];
+ for (int j = 0; j < offsets[i].length; j++){
+ offsets[i][j] = new TermVectorOffsetInfo(0, testTerms[i].length());
+ }
+ }
+ try {
+ Arrays.sort(testTerms);
+ for (int j = 0; j < 5; j++) {
+ writer = new TermVectorsWriter(dir, seg, fieldInfos);
+ writer.openDocument();
+
+ for (int k = 0; k < testFields.length; k++) {
+ writer.openField(testFields[k]);
+ for (int i = 0; i < testTerms.length; i++) {
+ writer.addTerm(testTerms[i], 3, positions[i], offsets[i]);
+ }
+ writer.closeField();
+ }
+ writer.closeDocument();
+ writer.close();
+ }
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ assertTrue(false);
+ }
+ }
+
+ protected void tearDown() {
+
+ }
+
+ public void test() {
+ //Check to see the files were created properly in setup
+ assertTrue(writer.isDocumentOpen() == false);
+ assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION));
+ assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION));
+ }
+
+ public void testReader() {
+ try {
+ TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
+ assertTrue(reader != null);
+ TermFreqVector vector = reader.get(0, testFields[0]);
+ assertTrue(vector != null);
+ String [] terms = vector.getTerms();
+ assertTrue(terms != null);
+ assertTrue(terms.length == testTerms.length);
+ for (int i = 0; i < terms.length; i++) {
+ String term = terms[i];
+ //System.out.println("Term: " + term);
+ assertTrue(term.equals(testTerms[i]));
+ }
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ assertTrue(false);
+ }
+ }
+
+ public void testPositionReader() {
+ try {
+ TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
+ assertTrue(reader != null);
+ TermPositionVector vector;
+ String [] terms;
+ vector = (TermPositionVector)reader.get(0, testFields[0]);
+ assertTrue(vector != null);
+ terms = vector.getTerms();
+ assertTrue(terms != null);
+ assertTrue(terms.length == testTerms.length);
+ for (int i = 0; i < terms.length; i++) {
+ String term = terms[i];
+ //System.out.println("Term: " + term);
+ assertTrue(term.equals(testTerms[i]));
+ int [] positions = vector.getTermPositions(i);
+ assertTrue(positions != null);
+ assertTrue(positions.length == this.positions[i].length);
+ for (int j = 0; j < positions.length; j++) {
+ int position = positions[j];
+ assertTrue(position == this.positions[i][j]);
+ }
+ TermVectorOffsetInfo [] offset = vector.getOffsets(i);
+ assertTrue(offset != null);
+ assertTrue(offset.length == this.offsets[i].length);
+ for (int j = 0; j < offset.length; j++) {
+ TermVectorOffsetInfo termVectorOffsetInfo = offset[j];
+ assertTrue(termVectorOffsetInfo.equals(offsets[i][j]));
+ }
+ }
+
+      TermFreqVector freqVector = reader.get(0, testFields[1]); //no pos, no offset
+      assertTrue(freqVector != null);
+      assertTrue(!(freqVector instanceof TermPositionVector));
+ terms = freqVector.getTerms();
+ assertTrue(terms != null);
+ assertTrue(terms.length == testTerms.length);
+ for (int i = 0; i < terms.length; i++) {
+ String term = terms[i];
+ //System.out.println("Term: " + term);
+ assertTrue(term.equals(testTerms[i]));
+ }
+
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ assertTrue(false);
+ }
+ catch (ClassCastException cce)
+ {
+ cce.printStackTrace();
+ assertTrue(false);
+ }
+ }
+
+ public void testOffsetReader() {
+ try {
+ TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
+ assertTrue(reader != null);
+ TermPositionVector vector = (TermPositionVector)reader.get(0, testFields[0]);
+ assertTrue(vector != null);
+ String [] terms = vector.getTerms();
+ assertTrue(terms != null);
+ assertTrue(terms.length == testTerms.length);
+ for (int i = 0; i < terms.length; i++) {
+ String term = terms[i];
+ //System.out.println("Term: " + term);
+ assertTrue(term.equals(testTerms[i]));
+ int [] positions = vector.getTermPositions(i);
+ assertTrue(positions != null);
+ assertTrue(positions.length == this.positions[i].length);
+ for (int j = 0; j < positions.length; j++) {
+ int position = positions[j];
+ assertTrue(position == this.positions[i][j]);
+ }
+ TermVectorOffsetInfo [] offset = vector.getOffsets(i);
+ assertTrue(offset != null);
+ assertTrue(offset.length == this.offsets[i].length);
+ for (int j = 0; j < offset.length; j++) {
+ TermVectorOffsetInfo termVectorOffsetInfo = offset[j];
+ assertTrue(termVectorOffsetInfo.equals(offsets[i][j]));
+ }
+ }
+
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ assertTrue(false);
+ }
+ catch (ClassCastException cce)
+ {
+ cce.printStackTrace();
+ assertTrue(false);
+ }
+ }
+
+
+ /**
+ * Make sure exceptions and bad params are handled appropriately
+ */
+ public void testBadParams() {
+ try {
+ TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
+ assertTrue(reader != null);
+ //Bad document number, good field number
+ TermFreqVector vector = reader.get(50, testFields[0]);
+ assertTrue(vector == null);
+ } catch (Exception e) {
+ assertTrue(false);
+ }
+ try {
+ TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
+ assertTrue(reader != null);
+ //good document number, bad field number
+ TermFreqVector vector = reader.get(0, "f50");
+ assertTrue(vector == null);
+ } catch (Exception e) {
+ assertTrue(false);
+ }
+ }
+}
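
The consumption pattern these reader tests rely on, in isolation: a field written with positions and/or offsets comes back as a TermPositionVector (a subtype of TermFreqVector), and either array may be absent. The getStartOffset()/getEndOffset() accessor names on TermVectorOffsetInfo are assumed here; the class itself is new in this patch.

    import org.apache.lucene.index.TermFreqVector;
    import org.apache.lucene.index.TermPositionVector;
    import org.apache.lucene.index.TermVectorOffsetInfo;

    public class DumpVectorSketch {
      //Prints whatever position/offset detail the vector carries.
      public static void dump(TermFreqVector vector) {
        if (!(vector instanceof TermPositionVector)) {
          return; //frequency-only vector (Field.TermVector.YES)
        }
        TermPositionVector posVec = (TermPositionVector) vector;
        String[] terms = posVec.getTerms();
        for (int i = 0; i < terms.length; i++) {
          System.out.print(terms[i] + ":");
          int[] positions = posVec.getTermPositions(i); //may be null if positions were not stored
          if (positions != null) {
            for (int j = 0; j < positions.length; j++) {
              System.out.print(" pos=" + positions[j]);
            }
          }
          TermVectorOffsetInfo[] offsets = posVec.getOffsets(i); //may be null if offsets were not stored
          if (offsets != null) {
            for (int j = 0; j < offsets.length; j++) {
              System.out.print(" [" + offsets[j].getStartOffset() + ","
                  + offsets[j].getEndOffset() + ")");
            }
          }
          System.out.println();
        }
      }
    }
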
cvs server: Diffing src/test/org/apache/lucene/index/store
cvs server: Diffing src/test/org/apache/lucene/queryParser
cvs server: Diffing src/test/org/apache/lucene/search
Index: src/test/org/apache/lucene/search/TestTermVectors.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/test/org/apache/lucene/search/TestTermVectors.java,v
retrieving revision 1.4
diff -u -r1.4 TestTermVectors.java
--- src/test/org/apache/lucene/search/TestTermVectors.java 7 Sep 2004 18:26:36 -0000 1.4
+++ src/test/org/apache/lucene/search/TestTermVectors.java 8 Sep 2004 14:33:23 -0000
@@ -1,222 +1,298 @@
-package org.apache.lucene.search;
-
-/**
- * Copyright 2004 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import junit.framework.TestCase;
-import org.apache.lucene.analysis.SimpleAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.index.*;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.util.English;
-
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-
-public class TestTermVectors extends TestCase {
- private IndexSearcher searcher;
- private RAMDirectory directory = new RAMDirectory();
- public TestTermVectors(String s) {
- super(s);
- }
-
- public void setUp() throws Exception {
- IndexWriter writer
- = new IndexWriter(directory, new SimpleAnalyzer(), true);
- //writer.setUseCompoundFile(true);
- //writer.infoStream = System.out;
- for (int i = 0; i < 1000; i++) {
- Document doc = new Document();
- doc.add(new Field("field", English.intToEnglish(i),
- Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
- writer.addDocument(doc);
- }
- writer.close();
- searcher = new IndexSearcher(directory);
- }
-
- protected void tearDown() {
-
- }
-
- public void test() {
- assertTrue(searcher != null);
- }
-
- public void testTermVectors() {
- Query query = new TermQuery(new Term("field", "seventy"));
- try {
- Hits hits = searcher.search(query);
- assertEquals(100, hits.length());
-
- for (int i = 0; i < hits.length(); i++)
- {
- TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i));
- assertTrue(vector != null);
- assertTrue(vector.length == 1);
- //assertTrue();
- }
- TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(50));
- //System.out.println("Explain: " + searcher.explain(query, hits.id(50)));
- //System.out.println("Vector: " + vector[0].toString());
- } catch (IOException e) {
- assertTrue(false);
- }
- }
-
- public void testTermPositionVectors() {
- Query query = new TermQuery(new Term("field", "fifty"));
- try {
- Hits hits = searcher.search(query);
- assertEquals(100, hits.length());
-
- for (int i = 0; i < hits.length(); i++)
- {
- TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i));
- assertTrue(vector != null);
- assertTrue(vector.length == 1);
- //assertTrue();
- }
- } catch (IOException e) {
- assertTrue(false);
- }
- }
-
- public void testKnownSetOfDocuments() {
- String test1 = "eating chocolate in a computer lab"; //6 terms
- String test2 = "computer in a computer lab"; //5 terms
- String test3 = "a chocolate lab grows old"; //5 terms
- String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms
- Map test4Map = new HashMap();
- test4Map.put("chocolate", new Integer(3));
- test4Map.put("lab", new Integer(2));
- test4Map.put("eating", new Integer(1));
- test4Map.put("computer", new Integer(1));
- test4Map.put("with", new Integer(1));
- test4Map.put("a", new Integer(1));
- test4Map.put("colored", new Integer(1));
- test4Map.put("in", new Integer(1));
- test4Map.put("an", new Integer(1));
- test4Map.put("computer", new Integer(1));
- test4Map.put("old", new Integer(1));
-
- Document testDoc1 = new Document();
- setupDoc(testDoc1, test1);
- Document testDoc2 = new Document();
- setupDoc(testDoc2, test2);
- Document testDoc3 = new Document();
- setupDoc(testDoc3, test3);
- Document testDoc4 = new Document();
- setupDoc(testDoc4, test4);
-
- Directory dir = new RAMDirectory();
-
- try {
- IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
- assertTrue(writer != null);
- writer.addDocument(testDoc1);
- writer.addDocument(testDoc2);
- writer.addDocument(testDoc3);
- writer.addDocument(testDoc4);
- writer.close();
- IndexSearcher knownSearcher = new IndexSearcher(dir);
- TermEnum termEnum = knownSearcher.reader.terms();
- TermDocs termDocs = knownSearcher.reader.termDocs();
- //System.out.println("Terms: " + termEnum.size() + " Orig Len: " + termArray.length);
-
- Similarity sim = knownSearcher.getSimilarity();
- while (termEnum.next() == true)
- {
- Term term = termEnum.term();
- //System.out.println("Term: " + term);
- termDocs.seek(term);
- while (termDocs.next())
- {
- int docId = termDocs.doc();
- int freq = termDocs.freq();
- //System.out.println("Doc Id: " + docId + " freq " + freq);
- TermFreqVector vector = knownSearcher.reader.getTermFreqVector(docId, "field");
- float tf = sim.tf(freq);
- float idf = sim.idf(term, knownSearcher);
- //float qNorm = sim.queryNorm()
- //This is fine since we don't have stop words
- float lNorm = sim.lengthNorm("field", vector.getTerms().length);
- //float coord = sim.coord()
- //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm);
- assertTrue(vector != null);
- String[] vTerms = vector.getTerms();
- int [] freqs = vector.getTermFrequencies();
- for (int i = 0; i < vTerms.length; i++)
- {
- if (term.text().equals(vTerms[i]) == true)
- {
- assertTrue(freqs[i] == freq);
- }
- }
-
- }
- //System.out.println("--------");
- }
- Query query = new TermQuery(new Term("field", "chocolate"));
- Hits hits = knownSearcher.search(query);
- //doc 3 should be the first hit b/c it is the shortest match
- assertTrue(hits.length() == 3);
- float score = hits.score(0);
- /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString());
- System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0)));
- System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString());
- System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1)));
- System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString());
- System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/
- assertTrue(testDoc3.toString().equals(hits.doc(0).toString()));
- assertTrue(testDoc4.toString().equals(hits.doc(1).toString()));
- assertTrue(testDoc1.toString().equals(hits.doc(2).toString()));
- TermFreqVector vector = knownSearcher.reader.getTermFreqVector(hits.id(1), "field");
- assertTrue(vector != null);
- //System.out.println("Vector: " + vector);
- String[] terms = vector.getTerms();
- int [] freqs = vector.getTermFrequencies();
- assertTrue(terms != null && terms.length == 10);
- for (int i = 0; i < terms.length; i++) {
- String term = terms[i];
- //System.out.println("Term: " + term);
- int freq = freqs[i];
- assertTrue(test4.indexOf(term) != -1);
- Integer freqInt = (Integer)test4Map.get(term);
- assertTrue(freqInt != null);
- assertTrue(freqInt.intValue() == freq);
- }
- knownSearcher.close();
- } catch (IOException e) {
- e.printStackTrace();
- assertTrue(false);
- }
-
-
- }
-
- private void setupDoc(Document doc, String text)
- {
- doc.add(new Field("field", text, Field.Store.YES,
- Field.Index.TOKENIZED, Field.TermVector.YES));
- //System.out.println("Document: " + doc);
- }
-
-
-}
+package org.apache.lucene.search;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.*;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.English;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+public class TestTermVectors extends TestCase {
+ private IndexSearcher searcher;
+ private RAMDirectory directory = new RAMDirectory();
+ public TestTermVectors(String s) {
+ super(s);
+ }
+
+ public void setUp() throws Exception {
+ IndexWriter writer
+ = new IndexWriter(directory, new SimpleAnalyzer(), true);
+ //writer.setUseCompoundFile(true);
+ //writer.infoStream = System.out;
+ for (int i = 0; i < 1000; i++) {
+ Document doc = new Document();
+ Field.TermVector termVector;
+ int mod3 = i % 3;
+ int mod2 = i % 2;
+ if (mod2 == 0 && mod3 == 0){
+ termVector = Field.TermVector.WITH_POSITIONS_OFFSETS;
+ }
+ else if (mod2 == 0){
+ termVector = Field.TermVector.WITH_POSITIONS;
+ }
+ else if (mod3 == 0){
+        termVector = Field.TermVector.WITH_OFFSETS; //mod-3 docs store offsets (see testTermPositionVectors)
+ }
+ else {
+ termVector = Field.TermVector.YES;
+ }
+ doc.add(new Field("field", English.intToEnglish(i),
+ Field.Store.YES, Field.Index.TOKENIZED, termVector));
+ writer.addDocument(doc);
+ }
+ writer.close();
+ searcher = new IndexSearcher(directory);
+ }
+
+ protected void tearDown() {
+
+ }
+
+ public void test() {
+ assertTrue(searcher != null);
+ }
+
+ public void testTermVectors() {
+ Query query = new TermQuery(new Term("field", "seventy"));
+ try {
+ Hits hits = searcher.search(query);
+ assertEquals(100, hits.length());
+
+ for (int i = 0; i < hits.length(); i++)
+ {
+ TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i));
+ assertTrue(vector != null);
+ assertTrue(vector.length == 1);
+ //assertTrue();
+ }
+ TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(50));
+ //System.out.println("Explain: " + searcher.explain(query, hits.id(50)));
+ //System.out.println("Vector: " + vector[0].toString());
+ } catch (IOException e) {
+ assertTrue(false);
+ }
+ }
+
+ public void testTermPositionVectors() {
+ Query query = new TermQuery(new Term("field", "zero"));
+ try {
+ Hits hits = searcher.search(query);
+ assertEquals(1, hits.length());
+
+ for (int i = 0; i < hits.length(); i++)
+ {
+ TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i));
+ assertTrue(vector != null);
+ assertTrue(vector.length == 1);
+        boolean shouldBePosVector = hits.id(i) % 2 == 0;
+        assertTrue(!shouldBePosVector || vector[0] instanceof TermPositionVector);
+        if (shouldBePosVector)
+ {
+ TermPositionVector posVec = (TermPositionVector)vector[0];
+ String [] terms = posVec.getTerms();
+ assertTrue(terms != null && terms.length > 0);
+ for (int j = 0; j < terms.length; j++) {
+ int [] positions = posVec.getTermPositions(j);
+ assertTrue(positions != null);
+ assertTrue(positions.length > 0);
+ }
+ }
+        boolean shouldBeOffVector = hits.id(i) % 3 == 0;
+        if (shouldBeOffVector)
+ {
+ TermPositionVector posVec = (TermPositionVector)vector[0];
+ String [] terms = posVec.getTerms();
+ assertTrue(terms != null && terms.length > 0);
+ for (int j = 0; j < terms.length; j++) {
+ String term = terms[j];
+ TermVectorOffsetInfo [] offsets = posVec.getOffsets(j);
+ assertTrue(offsets != null);
+ assertTrue(offsets.length > 0);
+ }
+ }
+        boolean shouldBeBothVector = hits.id(i) % 6 == 0;
+        //System.out.println("Hit Id: " + hits.id(i));
+        if (shouldBeBothVector)
+ {
+ TermPositionVector posVec = (TermPositionVector)vector[0];
+ String [] terms = posVec.getTerms();
+ assertTrue(terms != null && terms.length > 0);
+ for (int j = 0; j < terms.length; j++) {
+ TermVectorOffsetInfo [] offsets = posVec.getOffsets(j);
+ assertTrue(offsets != null);
+ assertTrue(offsets.length > 0);
+ int [] positions = posVec.getTermPositions(j);
+ assertTrue(positions != null);
+ assertTrue(positions.length > 0);
+ }
+ }
+ //assertTrue();
+ }
+ } catch (IOException e) {
+ assertTrue(false);
+ }
+ }
+
+ public void testTermOffsetVectors() {
+ Query query = new TermQuery(new Term("field", "fifty"));
+ try {
+ Hits hits = searcher.search(query);
+ assertEquals(100, hits.length());
+
+ for (int i = 0; i < hits.length(); i++)
+ {
+ TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i));
+ assertTrue(vector != null);
+ assertTrue(vector.length == 1);
+
+ //assertTrue();
+ }
+ } catch (IOException e) {
+ assertTrue(false);
+ }
+ }
+
+ public void testKnownSetOfDocuments() {
+ String test1 = "eating chocolate in a computer lab"; //6 terms
+ String test2 = "computer in a computer lab"; //5 terms
+ String test3 = "a chocolate lab grows old"; //5 terms
+ String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms
+ Map test4Map = new HashMap();
+ test4Map.put("chocolate", new Integer(3));
+ test4Map.put("lab", new Integer(2));
+ test4Map.put("eating", new Integer(1));
+ test4Map.put("computer", new Integer(1));
+ test4Map.put("with", new Integer(1));
+ test4Map.put("a", new Integer(1));
+ test4Map.put("colored", new Integer(1));
+ test4Map.put("in", new Integer(1));
+ test4Map.put("an", new Integer(1));
+ test4Map.put("computer", new Integer(1));
+ test4Map.put("old", new Integer(1));
+
+ Document testDoc1 = new Document();
+ setupDoc(testDoc1, test1);
+ Document testDoc2 = new Document();
+ setupDoc(testDoc2, test2);
+ Document testDoc3 = new Document();
+ setupDoc(testDoc3, test3);
+ Document testDoc4 = new Document();
+ setupDoc(testDoc4, test4);
+
+ Directory dir = new RAMDirectory();
+
+ try {
+ IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
+ assertTrue(writer != null);
+ writer.addDocument(testDoc1);
+ writer.addDocument(testDoc2);
+ writer.addDocument(testDoc3);
+ writer.addDocument(testDoc4);
+ writer.close();
+ IndexSearcher knownSearcher = new IndexSearcher(dir);
+ TermEnum termEnum = knownSearcher.reader.terms();
+ TermDocs termDocs = knownSearcher.reader.termDocs();
+ //System.out.println("Terms: " + termEnum.size() + " Orig Len: " + termArray.length);
+
+ Similarity sim = knownSearcher.getSimilarity();
+ while (termEnum.next() == true)
+ {
+ Term term = termEnum.term();
+ //System.out.println("Term: " + term);
+ termDocs.seek(term);
+ while (termDocs.next())
+ {
+ int docId = termDocs.doc();
+ int freq = termDocs.freq();
+ //System.out.println("Doc Id: " + docId + " freq " + freq);
+ TermFreqVector vector = knownSearcher.reader.getTermFreqVector(docId, "field");
+ float tf = sim.tf(freq);
+ float idf = sim.idf(term, knownSearcher);
+ //float qNorm = sim.queryNorm()
+ //This is fine since we don't have stop words
+ float lNorm = sim.lengthNorm("field", vector.getTerms().length);
+ //float coord = sim.coord()
+ //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm);
+ assertTrue(vector != null);
+ String[] vTerms = vector.getTerms();
+ int [] freqs = vector.getTermFrequencies();
+ for (int i = 0; i < vTerms.length; i++)
+ {
+ if (term.text().equals(vTerms[i]) == true)
+ {
+ assertTrue(freqs[i] == freq);
+ }
+ }
+
+ }
+ //System.out.println("--------");
+ }
+ Query query = new TermQuery(new Term("field", "chocolate"));
+ Hits hits = knownSearcher.search(query);
+ //doc 3 should be the first hit b/c it is the shortest match
+ assertTrue(hits.length() == 3);
+ float score = hits.score(0);
+ /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString());
+ System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0)));
+ System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString());
+ System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1)));
+ System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString());
+ System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/
+ assertTrue(hits.id(0) == 2);
+ assertTrue(hits.id(1) == 3);
+ assertTrue(hits.id(2) == 0);
+ TermFreqVector vector = knownSearcher.reader.getTermFreqVector(hits.id(1), "field");
+ assertTrue(vector != null);
+ //System.out.println("Vector: " + vector);
+ String[] terms = vector.getTerms();
+ int [] freqs = vector.getTermFrequencies();
+ assertTrue(terms != null && terms.length == 10);
+ for (int i = 0; i < terms.length; i++) {
+ String term = terms[i];
+ //System.out.println("Term: " + term);
+ int freq = freqs[i];
+ assertTrue(test4.indexOf(term) != -1);
+ Integer freqInt = (Integer)test4Map.get(term);
+ assertTrue(freqInt != null);
+ assertTrue(freqInt.intValue() == freq);
+ }
+ knownSearcher.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ assertTrue(false);
+ }
+
+
+ }
+
+ private void setupDoc(Document doc, String text)
+ {
+ doc.add(new Field("field", text, Field.Store.YES,
+ Field.Index.TOKENIZED, Field.TermVector.YES));
+ //System.out.println("Document: " + doc);
+ }
+
+
+}
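
The setUp() above assigns a term-vector type per document id, and the assertions in testTermPositionVectors() depend on that layout. A small helper makes the expected mapping explicit (class and method names are illustrative, not part of the patch):

    import org.apache.lucene.document.Field;

    public class VectorScheduleSketch {
      //Mirrors TestTermVectors.setUp(): every 6th doc stores both positions
      //and offsets, other even docs store positions, other multiples of 3
      //store offsets, and the rest store plain term vectors.
      public static Field.TermVector expectedVector(int docId) {
        if (docId % 6 == 0) return Field.TermVector.WITH_POSITIONS_OFFSETS;
        if (docId % 2 == 0) return Field.TermVector.WITH_POSITIONS;
        if (docId % 3 == 0) return Field.TermVector.WITH_OFFSETS;
        return Field.TermVector.YES;
      }

      public static void main(String[] args) {
        for (int i = 0; i < 12; i++) {
          System.out.println(i + " -> " + expectedVector(i));
        }
      }
    }
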
cvs server: Diffing src/test/org/apache/lucene/search/spans
cvs server: Diffing src/test/org/apache/lucene/store
cvs server: Diffing src/test/org/apache/lucene/util
cvs server: Diffing xdocs
cvs server: Diffing xdocs/images
cvs server: Diffing xdocs/lucene-sandbox
cvs server: Diffing xdocs/lucene-sandbox/larm
cvs server: Diffing xdocs/stylesheets