? build ? dist ? META-INF ? prj ? src/java/org/apache/lucene/index/SegmentTermPositionVector.java ? src/java/org/apache/lucene/index/TermVectorOffsetInfo.java cvs server: Diffing . cvs server: Diffing docs cvs server: Diffing docs/images cvs server: Diffing docs/lucene-sandbox cvs server: Diffing docs/lucene-sandbox/larm cvs server: Diffing lib cvs server: Diffing src cvs server: Diffing src/demo cvs server: Diffing src/demo/org cvs server: Diffing src/demo/org/apache cvs server: Diffing src/demo/org/apache/lucene cvs server: Diffing src/demo/org/apache/lucene/demo cvs server: Diffing src/demo/org/apache/lucene/demo/html cvs server: Diffing src/java cvs server: Diffing src/java/org cvs server: Diffing src/java/org/apache cvs server: Diffing src/java/org/apache/lucene cvs server: Diffing src/java/org/apache/lucene/analysis cvs server: Diffing src/java/org/apache/lucene/analysis/de cvs server: Diffing src/java/org/apache/lucene/analysis/ru cvs server: Diffing src/java/org/apache/lucene/analysis/standard cvs server: Diffing src/java/org/apache/lucene/document Index: src/java/org/apache/lucene/document/Field.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/document/Field.java,v retrieving revision 1.22 diff -u -r1.22 Field.java --- src/java/org/apache/lucene/document/Field.java 1 Sep 2004 22:11:07 -0000 1.22 +++ src/java/org/apache/lucene/document/Field.java 8 Sep 2004 14:33:21 -0000 @@ -16,11 +16,12 @@ * limitations under the License. */ +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.Similarity; + import java.io.Reader; -import java.util.Date; -import org.apache.lucene.index.IndexReader; // for javadoc -import org.apache.lucene.search.Similarity; // for javadoc -import org.apache.lucene.search.Hits; // for javadoc +import java.util.Date; // for javadoc /** A field is a section of a Document. 
Each field has two parts, a name and a @@ -34,6 +35,8 @@ private String name = "body"; private String stringValue = null; private boolean storeTermVector = false; + private boolean storeOffsetWithTermVector = false; + private boolean storePositionWithTermVector = false; private Reader readerValue = null; private boolean isStored = false; private boolean isIndexed = true; @@ -86,11 +89,22 @@ } public static final class TermVector { + public static final boolean HAS_POSITIONS = true; + public static final boolean HAS_OFFSETS = true; private String name; + private boolean storePositions = false; + private boolean storeOffsets = false; private TermVector() {} private TermVector(String name) { this.name = name; } + + private TermVector(String name, boolean storeOffsets, boolean storePositions) { + this.name = name; + this.storeOffsets = storeOffsets; + this.storePositions = storePositions; + } + public String toString() { return name; } @@ -100,6 +114,26 @@ /** Store the term vectors of each document. A term vector is a list * of the document's terms and their number of occurences in that document. */ public static final TermVector YES = new TermVector("YES"); + /** + * Store the term vector + token position information + * + * @see #YES + */ + public static final TermVector WITH_POSITIONS = new TermVector("WITH_POSITIONS", HAS_POSITIONS, !HAS_OFFSETS); + /** + * Store the term vector + Token offset information + * + * @see #YES + */ + public static final TermVector WITH_OFFSETS = new TermVector("WITH_OFFSETS", !HAS_POSITIONS, HAS_OFFSETS); + /** + * Store the term vector + Token position and offset information + * + * @see #YES + * @see #WITH_POSITIONS + * @see #WITH_OFFSETS + */ + public static final TermVector WITH_POSITIONS_OFFSETS = new TermVector("WITH_POSITIONS_OFFSETS", HAS_POSITIONS, HAS_OFFSETS); } /** Sets the boost factor hits on this field. 
This value will be @@ -373,7 +407,25 @@ this.storeTermVector = false; } else if (termVector == TermVector.YES) { this.storeTermVector = true; - } else { + this.storePositionWithTermVector = termVector.storePositions; + this.storeOffsetWithTermVector = termVector.storeOffsets; + } + else if (termVector == TermVector.WITH_POSITIONS) { + this.storeTermVector = true; + this.storePositionWithTermVector = true; + this.storeOffsetWithTermVector = false; + } + else if (termVector == TermVector.WITH_OFFSETS) { + this.storeTermVector = true; + this.storePositionWithTermVector = false; + this.storeOffsetWithTermVector = true; + } + else if (termVector == TermVector.WITH_POSITIONS_OFFSETS) { + this.storeTermVector = true; + this.storePositionWithTermVector = true; + this.storeOffsetWithTermVector = true; + } + else { throw new IllegalArgumentException("unknown termVector parameter " + termVector); } } @@ -402,6 +454,14 @@ */ public final boolean isTermVectorStored() { return storeTermVector; } + public boolean isStoreOffsetWithTermVector(){ + return storeOffsetWithTermVector; + } + + public boolean isStorePositionWithTermVector(){ + return storePositionWithTermVector; + } + /** Prints a Field for human consumption. 
*/ public final String toString() { StringBuffer result = new StringBuffer(); @@ -422,6 +482,16 @@ result.append(","); result.append("termVector"); } + if (storeOffsetWithTermVector) { + if (result.length() > 0) + result.append(","); + result.append("termVectorOffsets"); + } + if (storePositionWithTermVector) { + if (result.length() > 0) + result.append(","); + result.append("termVectorPosition"); + } result.append('<'); result.append(name); result.append(':'); cvs server: Diffing src/java/org/apache/lucene/index Index: src/java/org/apache/lucene/index/DocumentWriter.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java,v retrieving revision 1.12 diff -u -r1.12 DocumentWriter.java --- src/java/org/apache/lucene/index/DocumentWriter.java 10 Jul 2004 06:19:01 -0000 1.12 +++ src/java/org/apache/lucene/index/DocumentWriter.java 8 Sep 2004 14:33:21 -0000 @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -import java.io.IOException; -import java.io.Reader; -import java.io.StringReader; -import java.util.Hashtable; -import java.util.Enumeration; -import java.util.Arrays; - -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.search.Similarity; import org.apache.lucene.store.Directory; import org.apache.lucene.store.OutputStream; -import org.apache.lucene.search.Similarity; + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.util.Arrays; +import java.util.Enumeration; +import java.util.Hashtable; final class DocumentWriter { private Analyzer analyzer; @@ -125,7 +125,7 @@ if (field.isIndexed()) { if (!field.isTokenized()) { // un-tokenized field - addPosition(fieldName, field.stringValue(), position++); + addPosition(fieldName, field.stringValue(), position++, new TermVectorOffsetInfo(-1,-1)); length++; } else { Reader reader; // find or make Reader @@ -142,7 +142,7 @@ try { for (Token t = stream.next(); t != null; t = stream.next()) { position += (t.getPositionIncrement() - 1); - addPosition(fieldName, t.termText(), position++); + addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(t.startOffset(), t.endOffset())); if (++length > maxFieldLength) break; } } finally { @@ -159,8 +159,9 @@ private final Term termBuffer = new Term("", ""); // avoid consing - private final void addPosition(String field, String text, int position) { + private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset) { termBuffer.set(field, text); + //System.out.println("Offset: " + offset); Posting ti = (Posting) postingTable.get(termBuffer); if (ti != null) { // 
word seen before int freq = ti.freq; @@ -172,10 +173,23 @@ ti.positions = newPositions; } ti.positions[freq] = position; // add new position + + if (offset != null) { + if (ti.offsets.length == freq){ + TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[freq*2]; + TermVectorOffsetInfo [] offsets = ti.offsets; + for (int i = 0; i < freq; i++) + { + newOffsets[i] = offsets[i]; + } + ti.offsets = newOffsets; + } + ti.offsets[freq] = offset; + } ti.freq = freq + 1; // update frequency } else { // word not seen before Term term = new Term(field, text, false); - postingTable.put(term, new Posting(term, position)); + postingTable.put(term, new Posting(term, position, offset)); } } @@ -294,12 +308,13 @@ termVectorWriter.openDocument(); } termVectorWriter.openField(currentField); + } else if (termVectorWriter != null) { termVectorWriter.closeField(); } } if (termVectorWriter != null && termVectorWriter.isFieldOpen()) { - termVectorWriter.addTerm(posting.term.text(), postingFreq); + termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets); } } if (termVectorWriter != null) @@ -336,11 +351,14 @@ Term term; // the Term int freq; // its frequency in doc int[] positions; // positions it occurs at + TermVectorOffsetInfo [] offsets; - Posting(Term t, int position) { + Posting(Term t, int position, TermVectorOffsetInfo offset) { term = t; freq = 1; positions = new int[1]; positions[0] = position; + offsets = new TermVectorOffsetInfo[1]; + offsets[0] = offset; } } Index: src/java/org/apache/lucene/index/FieldInfo.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/FieldInfo.java,v retrieving revision 1.3 diff -u -r1.3 FieldInfo.java --- src/java/org/apache/lucene/index/FieldInfo.java 29 Mar 2004 22:48:02 -0000 1.3 +++ src/java/org/apache/lucene/index/FieldInfo.java 8 Sep 2004 14:33:21 -0000 @@ -23,11 +23,16 @@ // true if term vector for this 
field should be stored boolean storeTermVector; + boolean storeOffsetWithTermVector = false; + boolean storePositionWithTermVector = false; - FieldInfo(String na, boolean tk, int nu, boolean storeTermVector) { + FieldInfo(String na, boolean tk, int nu, boolean storeTermVector, + boolean storePositionWithTermVector, boolean storeOffsetWithTermVector) { name = na; isIndexed = tk; number = nu; this.storeTermVector = storeTermVector; + this.storeOffsetWithTermVector = storeOffsetWithTermVector; + this.storePositionWithTermVector = storePositionWithTermVector; } } Index: src/java/org/apache/lucene/index/FieldInfos.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/FieldInfos.java,v retrieving revision 1.10 diff -u -r1.10 FieldInfos.java --- src/java/org/apache/lucene/index/FieldInfos.java 25 Aug 2004 12:06:14 -0000 1.10 +++ src/java/org/apache/lucene/index/FieldInfos.java 8 Sep 2004 14:33:21 -0000 @@ -16,15 +16,14 @@ * limitations under the License. */ -import java.util.*; -import java.io.IOException; - import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; - import org.apache.lucene.store.Directory; -import org.apache.lucene.store.OutputStream; import org.apache.lucene.store.InputStream; +import org.apache.lucene.store.OutputStream; + +import java.io.IOException; +import java.util.*; /** Access to the Field Info file that describes document fields and whether or * not they are indexed. Each segment has a separate Field Info file. 
Objects @@ -61,7 +60,8 @@ Enumeration fields = doc.fields(); while (fields.hasMoreElements()) { Field field = (Field) fields.nextElement(); - add(field.name(), field.isIndexed(), field.isTermVectorStored()); + add(field.name(), field.isIndexed(), field.isTermVectorStored(), field.isStorePositionWithTermVector(), + field.isStoreOffsetWithTermVector()); } } @@ -69,10 +69,11 @@ * @param names The names of the fields * @param storeTermVectors Whether the fields store term vectors or not */ - public void addIndexed(Collection names, boolean storeTermVectors) { + public void addIndexed(Collection names, boolean storeTermVectors, boolean storePositionWithTermVector, + boolean storeOffsetWithTermVector) { Iterator i = names.iterator(); while (i.hasNext()) { - add((String)i.next(), true, storeTermVectors); + add((String)i.next(), true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector); } } @@ -94,13 +95,15 @@ * Calls three parameter add with false for the storeTermVector parameter * @param name The name of the Field * @param isIndexed true if the field is indexed - * @see #add(String, boolean, boolean) + * @see #add(String, boolean, boolean, boolean, boolean) */ public void add(String name, boolean isIndexed) { - add(name, isIndexed, false); + add(name, isIndexed, false, false, false); } - + public void add(String name, boolean isIndexed, boolean storeTermVector){ + add(name, isIndexed, storeTermVector, false, false); + } /** If the field is not yet known, adds it. If it is known, checks to make * sure that the isIndexed flag is the same as was given previously for this * field. If not - marks it as being indexed. 
Same goes for storeTermVector @@ -109,10 +112,11 @@ * @param isIndexed true if the field is indexed * @param storeTermVector true if the term vector should be stored */ - public void add(String name, boolean isIndexed, boolean storeTermVector) { + public void add(String name, boolean isIndexed, boolean storeTermVector, + boolean storePositionWithTermVector, boolean storeOffsetWithTermVector) { FieldInfo fi = fieldInfo(name); if (fi == null) { - addInternal(name, isIndexed, storeTermVector); + addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector); } else { if (fi.isIndexed != isIndexed) { fi.isIndexed = true; // once indexed, always index @@ -120,13 +124,21 @@ if (fi.storeTermVector != storeTermVector) { fi.storeTermVector = true; // once vector, always vector } + if (fi.storePositionWithTermVector != storePositionWithTermVector) { + fi.storePositionWithTermVector = true; // once vector, always vector + } + if (fi.storeOffsetWithTermVector != storeOffsetWithTermVector) { + fi.storeOffsetWithTermVector = true; // once vector, always vector + } } } private void addInternal(String name, boolean isIndexed, - boolean storeTermVector) { + boolean storeTermVector, boolean storePositionWithTermVector, + boolean storeOffsetWithTermVector) { FieldInfo fi = - new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector); + new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector, storePositionWithTermVector, + storeOffsetWithTermVector); byNumber.add(fi); byName.put(name, fi); } @@ -182,6 +194,8 @@ byte bits = 0x0; if (fi.isIndexed) bits |= 0x1; if (fi.storeTermVector) bits |= 0x2; + if (fi.storePositionWithTermVector) bits |= 0x4; + if (fi.storeOffsetWithTermVector) bits |= 0x8; output.writeString(fi.name); //Was REMOVE //output.writeByte((byte)(fi.isIndexed ? 
1 : 0)); @@ -196,7 +210,9 @@ byte bits = input.readByte(); boolean isIndexed = (bits & 0x1) != 0; boolean storeTermVector = (bits & 0x2) != 0; - addInternal(name, isIndexed, storeTermVector); + boolean storePositionsWithTermVector = (bits & 0x4) != 0; + boolean storeOffsetWithTermVector = (bits & 0x8) != 0; + addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector); } } Index: src/java/org/apache/lucene/index/FieldsReader.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/FieldsReader.java,v retrieving revision 1.8 diff -u -r1.8 FieldsReader.java --- src/java/org/apache/lucene/index/FieldsReader.java 1 Sep 2004 20:04:12 -0000 1.8 +++ src/java/org/apache/lucene/index/FieldsReader.java 8 Sep 2004 14:33:21 -0000 @@ -16,12 +16,12 @@ * limitations under the License. */ -import java.io.IOException; - -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.InputStream; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.InputStream; + +import java.io.IOException; /** * Class responsible for access to stored document fields. 
@@ -75,12 +75,27 @@ index = Field.Index.UN_TOKENIZED; else index = Field.Index.NO; + Field.TermVector termVector = Field.TermVector.NO; + if (fi.storeTermVector == true){ + if (fi.storePositionWithTermVector == true && fi.storeOffsetWithTermVector == true) { + termVector = Field.TermVector.WITH_POSITIONS_OFFSETS; + } + else if (fi.storePositionWithTermVector == true && fi.storeOffsetWithTermVector == false) { + termVector = Field.TermVector.WITH_POSITIONS; + } + else if (fi.storePositionWithTermVector == false && fi.storeOffsetWithTermVector == true) { + termVector = Field.TermVector.WITH_OFFSETS; + } + } + else{ + termVector = Field.TermVector.NO; + } + //termVector = fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO; doc.add(new Field(fi.name, // name fieldsStream.readString(), // read value Field.Store.YES, index, - fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO)); + termVector)); } - return doc; } } Index: src/java/org/apache/lucene/index/FilterIndexReader.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/FilterIndexReader.java,v retrieving revision 1.12 diff -u -r1.12 FilterIndexReader.java --- src/java/org/apache/lucene/index/FilterIndexReader.java 14 Jun 2004 00:15:24 -0000 1.12 +++ src/java/org/apache/lucene/index/FilterIndexReader.java 8 Sep 2004 14:33:21 -0000 @@ -16,11 +16,11 @@ * limitations under the License. */ +import org.apache.lucene.document.Document; + import java.io.IOException; import java.util.Collection; -import org.apache.lucene.document.Document; - /** A FilterIndexReader contains another IndexReader, which it * uses as its basic source of data, possibly transforming the data along the * way or providing additional functionality. 
The class @@ -145,5 +145,9 @@ */ public Collection getIndexedFieldNames(boolean storedTermVector) { return in.getIndexedFieldNames(storedTermVector); + } + + public Collection getTermVectorFieldNames(boolean storePositionWithTermVector, boolean storeOffsetWithTermVector) { + return in.getTermVectorFieldNames(storePositionWithTermVector, storeOffsetWithTermVector); } } Index: src/java/org/apache/lucene/index/IndexReader.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/IndexReader.java,v retrieving revision 1.35 diff -u -r1.35 IndexReader.java --- src/java/org/apache/lucene/index/IndexReader.java 15 Aug 2004 20:49:30 -0000 1.35 +++ src/java/org/apache/lucene/index/IndexReader.java 8 Sep 2004 14:33:21 -0000 @@ -16,16 +16,16 @@ * limitations under the License. */ -import java.io.IOException; -import java.io.File; -import java.util.Collection; - +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.search.Similarity; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.Lock; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; // for javadoc -import org.apache.lucene.search.Similarity; + +import java.io.File; +import java.io.IOException; +import java.util.Collection; /** IndexReader is an abstract class, providing an interface for accessing an index. 
Search of an index is done entirely through this abstract interface, @@ -554,6 +554,26 @@ * @return Collection of Strings indicating the names of the fields */ public abstract Collection getIndexedFieldNames(boolean storedTermVector); + + /** + * Get all FieldNames that have term vector information, as well as position and/or offset information + * @param storePositionWithTermVector + * @param storeOffsetWithTermVector + * @return + */ + public abstract Collection getTermVectorFieldNames(boolean storePositionWithTermVector, + boolean storeOffsetWithTermVector); + + /** + * + * @param storedTermVector + * @return Collection of Strings indicating the names of the fields + * @see #getIndexedFieldNames(boolean, boolean, boolean) with the last two as false and false + */ + /*public Collection getIndexedFieldNames(boolean storedTermVector) + { + return getIndexedFieldNames(storedTermVector, false, false); + }*/ /** * Returns true iff the index in the named directory is Index: src/java/org/apache/lucene/index/MultiReader.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/MultiReader.java,v retrieving revision 1.8 diff -u -r1.8 MultiReader.java --- src/java/org/apache/lucene/index/MultiReader.java 6 Aug 2004 20:50:29 -0000 1.8 +++ src/java/org/apache/lucene/index/MultiReader.java 8 Sep 2004 14:33:21 -0000 @@ -16,16 +16,12 @@ * limitations under the License. */ -import java.io.IOException; -import java.util.Collection; -import java.util.HashSet; -import java.util.Hashtable; -import java.util.Iterator; -import java.util.Set; - import org.apache.lucene.document.Document; import org.apache.lucene.store.Directory; +import java.io.IOException; +import java.util.*; + /** An IndexReader which reads multiple indexes, appending their content. 
* * @version $Id: MultiReader.java,v 1.8 2004/08/06 20:50:29 dnaber Exp $ @@ -248,6 +244,17 @@ for (int i = 0; i < subReaders.length; i++) { IndexReader reader = subReaders[i]; Collection names = reader.getIndexedFieldNames(storedTermVector); + fieldSet.addAll(names); + } + return fieldSet; + } + + public Collection getTermVectorFieldNames(boolean storePositionWithTermVector, boolean storeOffsetWithTermVector) { + // maintain a unique set of field names + Set fieldSet = new HashSet(); + for (int i = 0; i < subReaders.length; i++) { + IndexReader reader = subReaders[i]; + Collection names = reader.getTermVectorFieldNames(storePositionWithTermVector, storeOffsetWithTermVector); fieldSet.addAll(names); } return fieldSet; Index: src/java/org/apache/lucene/index/SegmentMerger.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/SegmentMerger.java,v retrieving revision 1.14 diff -u -r1.14 SegmentMerger.java --- src/java/org/apache/lucene/index/SegmentMerger.java 15 Aug 2004 11:26:05 -0000 1.14 +++ src/java/org/apache/lucene/index/SegmentMerger.java 8 Sep 2004 14:33:22 -0000 @@ -16,14 +16,14 @@ * limitations under the License. */ -import java.util.Vector; -import java.util.Iterator; -import java.io.IOException; - import org.apache.lucene.store.Directory; import org.apache.lucene.store.OutputStream; import org.apache.lucene.store.RAMOutputStream; +import java.io.IOException; +import java.util.Iterator; +import java.util.Vector; + /** * The SegmentMerger class combines two or more Segments, represented by an IndexReader ({@link #add}, * into a single Segment. 
After adding the appropriate readers, call the merge method to combine the @@ -157,8 +157,11 @@ int docCount = 0; for (int i = 0; i < readers.size(); i++) { IndexReader reader = (IndexReader) readers.elementAt(i); - fieldInfos.addIndexed(reader.getIndexedFieldNames(true), true); - fieldInfos.addIndexed(reader.getIndexedFieldNames(false), false); + //Can only store position and offset information when storing term vectors + fieldInfos.addIndexed(reader.getTermVectorFieldNames(true, true), true, true, true); + fieldInfos.addIndexed(reader.getTermVectorFieldNames(true, false), true, true, false); + fieldInfos.addIndexed(reader.getTermVectorFieldNames(false, false), true, false, false); + fieldInfos.addIndexed(reader.getIndexedFieldNames(false), false, false, false); fieldInfos.add(reader.getFieldNames(false), false); } fieldInfos.write(directory, segment + ".fnm"); @@ -211,9 +214,15 @@ termVectorsWriter.openField(termVector.getField()); String [] terms = termVector.getTerms(); int [] freqs = termVector.getTermFrequencies(); + boolean positionVector = termVector instanceof TermPositionVector ? 
true : false; for (int t = 0; t < terms.length; t++) { - termVectorsWriter.addTerm(terms[t], freqs[t]); + if (positionVector == false) { + termVectorsWriter.addTerm(terms[t], freqs[t]); + } else { + termVectorsWriter.addTerm(terms[t], freqs[t], ((TermPositionVector)termVector).getTermPositions(t), + ((TermPositionVector)termVector).getOffsets(t)); + } } } termVectorsWriter.closeDocument(); Index: src/java/org/apache/lucene/index/SegmentReader.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/SegmentReader.java,v retrieving revision 1.26 diff -u -r1.26 SegmentReader.java --- src/java/org/apache/lucene/index/SegmentReader.java 17 Aug 2004 08:56:08 -0000 1.26 +++ src/java/org/apache/lucene/index/SegmentReader.java 8 Sep 2004 14:33:22 -0000 @@ -16,20 +16,15 @@ * limitations under the License. */ -import java.io.IOException; -import java.util.Collection; -import java.util.Enumeration; -import java.util.HashSet; -import java.util.Hashtable; -import java.util.Set; -import java.util.Vector; - import org.apache.lucene.document.Document; +import org.apache.lucene.store.Directory; import org.apache.lucene.store.InputStream; import org.apache.lucene.store.OutputStream; -import org.apache.lucene.store.Directory; import org.apache.lucene.util.BitVector; +import java.io.IOException; +import java.util.*; + /** * FIXME: Describe class SegmentReader here. 
* @@ -325,6 +320,26 @@ } return fieldSet; + } + + /** + * Get all FieldNames that have term vector information, as well as position and/or offset information + * + * @param storePositionWithTermVector + * @param storeOffsetWithTermVector + * @return + */ + public Collection getTermVectorFieldNames(boolean storePositionWithTermVector, boolean storeOffsetWithTermVector) { + // maintain a unique set of field names + Set fieldSet = new HashSet(); + for (int i = 0; i < fieldInfos.size(); i++) { + FieldInfo fi = fieldInfos.fieldInfo(i); + if (fi.isIndexed == true && fi.storeTermVector == true && fi.storePositionWithTermVector == storePositionWithTermVector + && fi.storeOffsetWithTermVector == storeOffsetWithTermVector){ + fieldSet.add(fi.name); + } + } + return fieldSet; } public synchronized byte[] norms(String field) throws IOException { Index: src/java/org/apache/lucene/index/SegmentTermVector.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/SegmentTermVector.java,v retrieving revision 1.3 diff -u -r1.3 SegmentTermVector.java --- src/java/org/apache/lucene/index/SegmentTermVector.java 10 Jul 2004 06:19:01 -0000 1.3 +++ src/java/org/apache/lucene/index/SegmentTermVector.java 8 Sep 2004 14:33:22 -0000 @@ -4,9 +4,9 @@ /** */ class SegmentTermVector implements TermFreqVector { - private String field; - private String terms[]; - private int termFreqs[]; + protected String field; + protected String terms[]; + protected int termFreqs[]; SegmentTermVector(String field, String terms[], int termFreqs[]) { this.field = field; Index: src/java/org/apache/lucene/index/TermPositionVector.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/TermPositionVector.java,v retrieving revision 1.1 diff -u -r1.1 TermPositionVector.java --- src/java/org/apache/lucene/index/TermPositionVector.java 20 Feb 
2004 20:14:55 -0000 1.1 +++ src/java/org/apache/lucene/index/TermPositionVector.java 8 Sep 2004 14:33:22 -0000 @@ -4,10 +4,20 @@ * positions in which each of the terms is found. */ public interface TermPositionVector extends TermFreqVector { - + /** Returns an array of positions in which the term is found. * Terms are identified by the index at which its number appears in the - * term number array obtained from getTermNumbers method. + * term String array obtained from the indexOf method. */ public int[] getTermPositions(int index); + + /** + * Returns an array of TermVectorOffsetInfo in which the term is found. + * + * @see org.apache.lucene.analysis.Token + * + * @param index The position in the array to get the offsets from + * @return An array of TermVectorOffsetInfo objects or the empty list + */ + public TermVectorOffsetInfo [] getOffsets(int index); } Index: src/java/org/apache/lucene/index/TermVectorsReader.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/TermVectorsReader.java,v retrieving revision 1.3 diff -u -r1.3 TermVectorsReader.java --- src/java/org/apache/lucene/index/TermVectorsReader.java 17 Aug 2004 20:53:16 -0000 1.3 +++ src/java/org/apache/lucene/index/TermVectorsReader.java 8 Sep 2004 14:33:22 -0000 @@ -210,11 +210,16 @@ if (numTerms == 0) return new SegmentTermVector(field, null, null); tvf.readVInt(); - + byte storePosByte = tvf.readByte(); + byte storeOffByte = tvf.readByte(); + String terms[] = new String[numTerms]; int termFreqs[] = new int[numTerms]; - + int positions[][]; + TermVectorOffsetInfo offsets[][];//we may not need these, but declare them + positions = new int[numTerms][]; + offsets = new TermVectorOffsetInfo[numTerms][]; int start = 0; int deltaLength = 0; int totalLength = 0; @@ -234,8 +239,36 @@ terms[i] = new String(buffer, 0, totalLength); previousString = terms[i]; termFreqs[i] = tvf.readVInt(); + //Next byte is whether we 
are storing position info, if 1, then we are. + byte storingInfo = tvf.readByte(); + if (storePosByte == 1 && storingInfo == 1)//should only be 1 when storePosInfo is 1 + { //read in the positions + int numPositions = tvf.readVInt(); + int [] pos = new int[numPositions]; + positions[i] = pos; + for (int j = 0; j < numPositions; j++) + { + pos[j] = tvf.readVInt(); + } + } + storingInfo = tvf.readByte(); + if (storeOffByte == 1 && storingInfo == 1) + { + int numOffsets = tvf.readVInt(); + TermVectorOffsetInfo[] offs = new TermVectorOffsetInfo[numOffsets]; + offsets[i] = offs; + for (int j = 0; j < numOffsets; j++) { + offs[j] = new TermVectorOffsetInfo(tvf.readVInt(), tvf.readVInt()); + } + } + } + SegmentTermVector tv; + if (storePosByte == 1 || storeOffByte == 1){ + tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets); + } + else { + tv = new SegmentTermVector(field, terms, termFreqs); } - SegmentTermVector tv = new SegmentTermVector(field, terms, termFreqs); return tv; } Index: src/java/org/apache/lucene/index/TermVectorsWriter.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java,v retrieving revision 1.2 diff -u -r1.2 TermVectorsWriter.java --- src/java/org/apache/lucene/index/TermVectorsWriter.java 17 Aug 2004 20:53:16 -0000 1.2 +++ src/java/org/apache/lucene/index/TermVectorsWriter.java 8 Sep 2004 14:33:22 -0000 @@ -65,16 +65,9 @@ private TVField currentField = null; private long currentDocPointer = -1; - - /** Create term vectors writer for the specified segment in specified - * directory. A new TermVectorsWriter should be created for each - * segment. The parameter maxFields indicates how many total - * fields are found in this document. Not all of these fields may require - * termvectors to be stored, so the number of calls to - * openField is less or equal to this number. 
- */ - public TermVectorsWriter(Directory directory, String segment, - FieldInfos fieldInfos) + + + public TermVectorsWriter(Directory directory, String segment, FieldInfos fieldInfos) throws IOException { // Open files for TermVector storage tvx = directory.createFile(segment + TVX_EXTENSION); @@ -83,12 +76,12 @@ tvd.writeInt(FORMAT_VERSION); tvf = directory.createFile(segment + TVF_EXTENSION); tvf.writeInt(FORMAT_VERSION); - this.fieldInfos = fieldInfos; fields = new Vector(fieldInfos.size()); terms = new Vector(); } + public final void openDocument() throws IOException { @@ -124,7 +117,9 @@ if (!isDocumentOpen()) throw new IllegalStateException("Cannot open field when no document is open."); closeField(); - currentField = new TVField(fieldInfos.fieldNumber(field)); + FieldInfo fieldInfo = fieldInfos.fieldInfo(field); + currentField = new TVField(fieldInfo.number, fieldInfo.storePositionWithTermVector, + fieldInfo.storeOffsetWithTermVector); } /** Finished processing current field. This should be followed by a call to @@ -160,14 +155,21 @@ if (!isDocumentOpen()) throw new IllegalStateException("Cannot add terms when document is not open"); if (!isFieldOpen()) throw new IllegalStateException("Cannot add terms when field is not open"); - addTermInternal(termText, freq); + addTermInternal(termText, freq, null, null); + } + + public final void addTerm(String termText, int freq, int [] positions, TermVectorOffsetInfo [] offsets) + { + addTermInternal(termText, freq, positions, offsets); } - private final void addTermInternal(String termText, int freq) { + private final void addTermInternal(String termText, int freq, int [] positions, TermVectorOffsetInfo [] offsets) { currentField.length += freq; TVTerm term = new TVTerm(); term.termText = termText; term.freq = freq; + term.positions = positions; + term.offsets = offsets; terms.add(term); } @@ -197,16 +199,47 @@ addTermFreqVectorInternal(vector); } + /** Add specified vectors to the document. 
+ */ + public final void addPositionVectors(TermPositionVector[] vectors) + throws IOException { + if (!isDocumentOpen()) throw new IllegalStateException("Cannot add term vectors when document is not open"); + if (isFieldOpen()) throw new IllegalStateException("Cannot add term vectors when field is open"); + + for (int i = 0; i < vectors.length; i++) { + addTermPositionVector(vectors[i]); + } + } + + + /** Add specified vector to the document. Document must be open but no field + * should be open or exception is thrown. The same document can have addTerm + * and addVectors calls mixed, however a given field must either be + * populated with addTerm or with addVector. * + */ + public final void addTermPositionVector(TermPositionVector vector) + throws IOException { + if (!isDocumentOpen()) throw new IllegalStateException("Cannot add term vector when document is not open"); + if (isFieldOpen()) throw new IllegalStateException("Cannot add term vector when field is open"); + addTermPositionVectorInternal(vector); + } private final void addTermFreqVectorInternal(TermFreqVector vector) throws IOException { openField(vector.getField()); for (int i = 0; i < vector.size(); i++) { - addTermInternal(vector.getTerms()[i], vector.getTermFrequencies()[i]); + addTermInternal(vector.getTerms()[i], vector.getTermFrequencies()[i], null, null); } closeField(); } - + private final void addTermPositionVectorInternal(TermPositionVector vector) + throws IOException { + openField(vector.getField()); + for (int i = 0; i < vector.size(); i++) { + addTermInternal(vector.getTerms()[i], vector.getTermFrequencies()[i], vector.getTermPositions(i), vector.getOffsets(i)); + } + closeField(); + } /** Close all streams. */ @@ -249,22 +282,101 @@ tvf.writeVInt(size = terms.size()); tvf.writeVInt(currentField.length - size); + boolean storePositions = currentField.storePositions; + boolean storeOffsets = currentField.storeOffsets; + tvf.writeByte(storePositions == true ? 
(byte)1 :(byte)0); + tvf.writeByte(storeOffsets == true ? (byte)1 : (byte)0); String lastTermText = ""; // write term ids and positions - for (int i = 0; i < size; i++) { - TVTerm term = (TVTerm) terms.elementAt(i); - //tvf.writeString(term.termText); - int start = StringHelper.stringDifference(lastTermText, term.termText); - int length = term.termText.length() - start; - tvf.writeVInt(start); // write shared prefix length - tvf.writeVInt(length); // write delta length - tvf.writeChars(term.termText, start, length); // write delta chars - tvf.writeVInt(term.freq); - lastTermText = term.termText; + //Do it this way, so we don't have to check the flags inside the loop + if (storePositions == false && storeOffsets == false) + { + for (int i = 0; i < size; i++) { + TVTerm term = (TVTerm) terms.elementAt(i); + //tvf.writeString(term.termText); + writeCoreTermInfo(lastTermText, term); + writePositions(null, 0);//store the fact that we aren't storing the info + writeOffsets(null, 0); + lastTermText = term.termText; + } + } + else if (storePositions == true && storeOffsets == false) + { + for (int i = 0; i < size; i++) { + TVTerm term = (TVTerm) terms.elementAt(i); + //tvf.writeString(term.termText); + writeCoreTermInfo(lastTermText, term); + writePositions(term.positions, term.freq); + writeOffsets(null, 0);//store the fact that we aren't storing offsets + lastTermText = term.termText; + } + } + else if (storePositions == false && storeOffsets == true) + { + for (int i = 0; i < size; i++) { + TVTerm term = (TVTerm) terms.elementAt(i); + //tvf.writeString(term.termText); + writeCoreTermInfo(lastTermText, term); + writePositions(null, 0); + writeOffsets(term.offsets, term.freq); + lastTermText = term.termText; + } + } + else + { + for (int i = 0; i < size; i++) { + TVTerm term = (TVTerm) terms.elementAt(i); + //tvf.writeString(term.termText); + writeCoreTermInfo(lastTermText, term); + writePositions(term.positions, term.freq); + writeOffsets(term.offsets, term.freq); + 
lastTermText = term.termText; + } } } - + private void writeCoreTermInfo(String lastTermText, TVTerm term) throws IOException { + int start = StringHelper.stringDifference(lastTermText, term.termText); + int length = term.termText.length() - start; + tvf.writeVInt(start); // write shared prefix length + tvf.writeVInt(length); // write delta length + tvf.writeChars(term.termText, start, length); // write delta chars + tvf.writeVInt(term.freq); + } + + private void writePositions(int [] positions, int freq) throws IOException + { + if (positions != null && positions.length > 0) + { + tvf.writeByte((byte)1); + tvf.writeVInt(freq); + for (int i = 0; i < freq; i++) { + tvf.writeVInt(positions[i]); + } + } + else + { + tvf.writeByte((byte)0); + } + + } + private void writeOffsets(TermVectorOffsetInfo [] offsets, int freq) throws IOException + { + if (offsets != null && offsets.length > 0) + { + tvf.writeByte((byte)1); + tvf.writeVInt(freq); + + for (int i = 0; i < freq; i++) { + tvf.writeVInt(offsets[i].getStartOffset()); + tvf.writeVInt(offsets[i].getEndOffset() - offsets[i].getStartOffset()); //Save the diff between the two. 
+ } + } + else + { + tvf.writeByte((byte)0); + } + } private void writeDoc() throws IOException { @@ -304,16 +416,20 @@ int number; long tvfPointer = 0; int length = 0; // number of distinct term positions - - TVField(int number) { + boolean storePositions = false; + boolean storeOffsets = false; + TVField(int number, boolean storePos, boolean storeOff) { this.number = number; + storePositions = storePos; + storeOffsets = storeOff; } } private static class TVTerm { String termText; int freq = 0; - //int positions[] = null; + int positions[] = null; + TermVectorOffsetInfo [] offsets = null; } cvs server: Diffing src/java/org/apache/lucene/queryParser cvs server: Diffing src/java/org/apache/lucene/search cvs server: Diffing src/java/org/apache/lucene/search/spans cvs server: Diffing src/java/org/apache/lucene/store cvs server: Diffing src/java/org/apache/lucene/util cvs server: Diffing src/jsp cvs server: Diffing src/jsp/WEB-INF cvs server: Diffing src/test cvs server: Diffing src/test/org cvs server: Diffing src/test/org/apache cvs server: Diffing src/test/org/apache/lucene cvs server: Diffing src/test/org/apache/lucene/analysis cvs server: Diffing src/test/org/apache/lucene/analysis/de cvs server: Diffing src/test/org/apache/lucene/analysis/ru cvs server: Diffing src/test/org/apache/lucene/document cvs server: Diffing src/test/org/apache/lucene/index Index: src/test/org/apache/lucene/index/DocHelper.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/test/org/apache/lucene/index/DocHelper.java,v retrieving revision 1.1 diff -u -r1.1 DocHelper.java --- src/test/org/apache/lucene/index/DocHelper.java 20 Feb 2004 20:14:55 -0000 1.1 +++ src/test/org/apache/lucene/index/DocHelper.java 8 Sep 2004 14:33:22 -0000 @@ -1,159 +1,159 @@ -package org.apache.lucene.index; - -/** - * Created by IntelliJ IDEA. 
- * User: Grant Ingersoll - * Date: Feb 2, 2004 - * Time: 6:16:12 PM - * $Id: DocHelper.java,v 1.1 2004/02/20 20:14:55 cutting Exp $ - * Copyright 2004. Center For Natural Language Processing - */ - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.WhitespaceAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.store.Directory; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.Enumeration; - -/** - * - * - **/ -class DocHelper { - public static final String FIELD_1_TEXT = "field one text"; - public static final String TEXT_FIELD_1_KEY = "textField1"; - public static Field textField1 = Field.Text(TEXT_FIELD_1_KEY, FIELD_1_TEXT, false); - - public static final String FIELD_2_TEXT = "field field field two text"; - //Fields will be lexicographically sorted. So, the order is: field, text, two - public static final int [] FIELD_2_FREQS = {3, 1, 1}; - public static final String TEXT_FIELD_2_KEY = "textField2"; - public static Field textField2 = Field.Text(TEXT_FIELD_2_KEY, FIELD_2_TEXT, true); - - public static final String KEYWORD_TEXT = "Keyword"; - public static final String KEYWORD_FIELD_KEY = "keyField"; - public static Field keyField = Field.Keyword(KEYWORD_FIELD_KEY, KEYWORD_TEXT); - - public static final String UNINDEXED_FIELD_TEXT = "unindexed field text"; - public static final String UNINDEXED_FIELD_KEY = "unIndField"; - public static Field unIndField = Field.UnIndexed(UNINDEXED_FIELD_KEY, UNINDEXED_FIELD_TEXT); - - public static final String UNSTORED_1_FIELD_TEXT = "unstored field text"; - public static final String UNSTORED_FIELD_1_KEY = "unStoredField1"; - public static Field unStoredField1 = Field.UnStored(UNSTORED_FIELD_1_KEY, UNSTORED_1_FIELD_TEXT, false); - - public static final String UNSTORED_2_FIELD_TEXT = "unstored field text"; - public static final String 
UNSTORED_FIELD_2_KEY = "unStoredField2"; - public static Field unStoredField2 = Field.UnStored(UNSTORED_FIELD_2_KEY, UNSTORED_2_FIELD_TEXT, true); - -// public static Set fieldNamesSet = null; -// public static Set fieldValuesSet = null; - public static Map nameValues = null; - - static - { - - nameValues = new HashMap(); - nameValues.put(TEXT_FIELD_1_KEY, FIELD_1_TEXT); - nameValues.put(TEXT_FIELD_2_KEY, FIELD_2_TEXT); - nameValues.put(KEYWORD_FIELD_KEY, KEYWORD_TEXT); - nameValues.put(UNINDEXED_FIELD_KEY, UNINDEXED_FIELD_TEXT); - nameValues.put(UNSTORED_FIELD_1_KEY, UNSTORED_1_FIELD_TEXT); - nameValues.put(UNSTORED_FIELD_2_KEY, UNSTORED_2_FIELD_TEXT); - } - - /** - * Adds the fields above to a document - * @param doc The document to write - */ - public static void setupDoc(Document doc) { - doc.add(textField1); - doc.add(textField2); - doc.add(keyField); - doc.add(unIndField); - doc.add(unStoredField1); - doc.add(unStoredField2); - } - /** - * Writes the document to the directory using a segment named "test" - * @param dir - * @param doc - */ - public static void writeDoc(Directory dir, Document doc) - { - - writeDoc(dir, "test", doc); - } - /** - * Writes the document to the directory in the given segment - * @param dir - * @param segment - * @param doc - */ - public static void writeDoc(Directory dir, String segment, Document doc) - { - Analyzer analyzer = new WhitespaceAnalyzer(); - Similarity similarity = Similarity.getDefault(); - writeDoc(dir, analyzer, similarity, segment, doc); - } - /** - * Writes the document to the directory segment named "test" using the specified analyzer and similarity - * @param dir - * @param analyzer - * @param similarity - * @param doc - */ - public static void writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, Document doc) - { - writeDoc(dir, analyzer, similarity, "test", doc); - } - /** - * Writes the document to the directory segment using the analyzer and the similarity score - * @param dir - * @param 
analyzer - * @param similarity - * @param segment - * @param doc - */ - public static void writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, String segment, Document doc) - { - DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50); - try { - writer.addDocument(segment, doc); - } catch (IOException e) { - e.printStackTrace(); - } - } - - public static int numFields(Document doc) { - Enumeration fields = doc.fields(); - int result = 0; - while (fields.hasMoreElements()) { - fields.nextElement(); - result++; - } - return result; - } -} -/* - fieldNamesSet = new HashSet(); - fieldNamesSet.add(TEXT_FIELD_1_KEY); - fieldNamesSet.add(TEXT_FIELD_2_KEY); - fieldNamesSet.add(KEYWORD_FIELD_KEY); - fieldNamesSet.add(UNINDEXED_FIELD_KEY); - fieldNamesSet.add(UNSTORED_FIELD_1_KEY); - fieldNamesSet.add(UNSTORED_FIELD_2_KEY); - fieldValuesSet = new HashSet(); - fieldValuesSet.add(FIELD_1_TEXT); - fieldValuesSet.add(FIELD_2_TEXT); - fieldValuesSet.add(KEYWORD_TEXT); - fieldValuesSet.add(UNINDEXED_FIELD_TEXT); - fieldValuesSet.add(UNSTORED_1_FIELD_TEXT); - fieldValuesSet.add(UNSTORED_2_FIELD_TEXT); -*/ +package org.apache.lucene.index; + +/** + * Created by IntelliJ IDEA. + * User: Grant Ingersoll + * Date: Feb 2, 2004 + * Time: 6:16:12 PM + * $Id: DocHelper.java,v 1.1 2004/02/20 20:14:55 cutting Exp $ + * Copyright 2004. 
Center For Natural Language Processing + */ + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.search.Similarity; +import org.apache.lucene.store.Directory; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Enumeration; + +/** + * + * + **/ +class DocHelper { + public static final String FIELD_1_TEXT = "field one text"; + public static final String TEXT_FIELD_1_KEY = "textField1"; + public static Field textField1 = Field.Text(TEXT_FIELD_1_KEY, FIELD_1_TEXT, false); + + public static final String FIELD_2_TEXT = "field field field two text"; + //Fields will be lexicographically sorted. So, the order is: field, text, two + public static final int [] FIELD_2_FREQS = {3, 1, 1}; + public static final String TEXT_FIELD_2_KEY = "textField2"; + public static Field textField2 = new Field(TEXT_FIELD_2_KEY, FIELD_2_TEXT, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS); + + public static final String KEYWORD_TEXT = "Keyword"; + public static final String KEYWORD_FIELD_KEY = "keyField"; + public static Field keyField = Field.Keyword(KEYWORD_FIELD_KEY, KEYWORD_TEXT); + + public static final String UNINDEXED_FIELD_TEXT = "unindexed field text"; + public static final String UNINDEXED_FIELD_KEY = "unIndField"; + public static Field unIndField = Field.UnIndexed(UNINDEXED_FIELD_KEY, UNINDEXED_FIELD_TEXT); + + public static final String UNSTORED_1_FIELD_TEXT = "unstored field text"; + public static final String UNSTORED_FIELD_1_KEY = "unStoredField1"; + public static Field unStoredField1 = Field.UnStored(UNSTORED_FIELD_1_KEY, UNSTORED_1_FIELD_TEXT, false); + + public static final String UNSTORED_2_FIELD_TEXT = "unstored field text"; + public static final String UNSTORED_FIELD_2_KEY = "unStoredField2"; + public static Field unStoredField2 = 
Field.UnStored(UNSTORED_FIELD_2_KEY, UNSTORED_2_FIELD_TEXT, true); + +// public static Set fieldNamesSet = null; +// public static Set fieldValuesSet = null; + public static Map nameValues = null; + + static + { + + nameValues = new HashMap(); + nameValues.put(TEXT_FIELD_1_KEY, FIELD_1_TEXT); + nameValues.put(TEXT_FIELD_2_KEY, FIELD_2_TEXT); + nameValues.put(KEYWORD_FIELD_KEY, KEYWORD_TEXT); + nameValues.put(UNINDEXED_FIELD_KEY, UNINDEXED_FIELD_TEXT); + nameValues.put(UNSTORED_FIELD_1_KEY, UNSTORED_1_FIELD_TEXT); + nameValues.put(UNSTORED_FIELD_2_KEY, UNSTORED_2_FIELD_TEXT); + } + + /** + * Adds the fields above to a document + * @param doc The document to write + */ + public static void setupDoc(Document doc) { + doc.add(textField1); + doc.add(textField2); + doc.add(keyField); + doc.add(unIndField); + doc.add(unStoredField1); + doc.add(unStoredField2); + } + /** + * Writes the document to the directory using a segment named "test" + * @param dir + * @param doc + */ + public static void writeDoc(Directory dir, Document doc) + { + + writeDoc(dir, "test", doc); + } + /** + * Writes the document to the directory in the given segment + * @param dir + * @param segment + * @param doc + */ + public static void writeDoc(Directory dir, String segment, Document doc) + { + Analyzer analyzer = new WhitespaceAnalyzer(); + Similarity similarity = Similarity.getDefault(); + writeDoc(dir, analyzer, similarity, segment, doc); + } + /** + * Writes the document to the directory segment named "test" using the specified analyzer and similarity + * @param dir + * @param analyzer + * @param similarity + * @param doc + */ + public static void writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, Document doc) + { + writeDoc(dir, analyzer, similarity, "test", doc); + } + /** + * Writes the document to the directory segment using the analyzer and the similarity score + * @param dir + * @param analyzer + * @param similarity + * @param segment + * @param doc + */ + public static 
void writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, String segment, Document doc) + { + DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50); + try { + writer.addDocument(segment, doc); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public static int numFields(Document doc) { + Enumeration fields = doc.fields(); + int result = 0; + while (fields.hasMoreElements()) { + String name = fields.nextElement().toString(); + result++; + } + return result; + } +} +/* + fieldNamesSet = new HashSet(); + fieldNamesSet.add(TEXT_FIELD_1_KEY); + fieldNamesSet.add(TEXT_FIELD_2_KEY); + fieldNamesSet.add(KEYWORD_FIELD_KEY); + fieldNamesSet.add(UNINDEXED_FIELD_KEY); + fieldNamesSet.add(UNSTORED_FIELD_1_KEY); + fieldNamesSet.add(UNSTORED_FIELD_2_KEY); + fieldValuesSet = new HashSet(); + fieldValuesSet.add(FIELD_1_TEXT); + fieldValuesSet.add(FIELD_2_TEXT); + fieldValuesSet.add(KEYWORD_TEXT); + fieldValuesSet.add(UNINDEXED_FIELD_TEXT); + fieldValuesSet.add(UNSTORED_1_FIELD_TEXT); + fieldValuesSet.add(UNSTORED_2_FIELD_TEXT); +*/ Index: src/test/org/apache/lucene/index/TestDocumentWriter.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/test/org/apache/lucene/index/TestDocumentWriter.java,v retrieving revision 1.2 diff -u -r1.2 TestDocumentWriter.java --- src/test/org/apache/lucene/index/TestDocumentWriter.java 29 Mar 2004 22:48:06 -0000 1.2 +++ src/test/org/apache/lucene/index/TestDocumentWriter.java 8 Sep 2004 14:33:22 -0000 @@ -1,83 +1,83 @@ -package org.apache.lucene.index; - -/** - * Copyright 2004 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import junit.framework.TestCase; -import org.apache.lucene.store.RAMDirectory; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.WhitespaceAnalyzer; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; - -import java.io.IOException; - -public class TestDocumentWriter extends TestCase { - private RAMDirectory dir = new RAMDirectory(); - private Document testDoc = new Document(); - - - public TestDocumentWriter(String s) { - super(s); - } - - protected void setUp() { - DocHelper.setupDoc(testDoc); - } - - protected void tearDown() { - - } - - public void test() { - assertTrue(dir != null); - - } - - public void testAddDocument() { - Analyzer analyzer = new WhitespaceAnalyzer(); - Similarity similarity = Similarity.getDefault(); - DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50); - assertTrue(writer != null); - try { - writer.addDocument("test", testDoc); - //After adding the document, we should be able to read it back in - SegmentReader reader = new SegmentReader(new SegmentInfo("test", 1, dir)); - assertTrue(reader != null); - Document doc = reader.document(0); - assertTrue(doc != null); - - //System.out.println("Document: " + doc); - Field [] fields = doc.getFields("textField2"); - assertTrue(fields != null && fields.length == 1); - assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_2_TEXT)); - assertTrue(fields[0].isTermVectorStored() == true); - - fields = doc.getFields("textField1"); - 
assertTrue(fields != null && fields.length == 1); - assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_1_TEXT)); - assertTrue(fields[0].isTermVectorStored() == false); - - fields = doc.getFields("keyField"); - assertTrue(fields != null && fields.length == 1); - assertTrue(fields[0].stringValue().equals(DocHelper.KEYWORD_TEXT)); - } catch (IOException e) { - e.printStackTrace(); - assertTrue(false); - } - } -} +package org.apache.lucene.index; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import junit.framework.TestCase; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.search.Similarity; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; + +import java.io.IOException; + +public class TestDocumentWriter extends TestCase { + private RAMDirectory dir = new RAMDirectory(); + private Document testDoc = new Document(); + + + public TestDocumentWriter(String s) { + super(s); + } + + protected void setUp() { + DocHelper.setupDoc(testDoc); + } + + protected void tearDown() { + + } + + public void test() { + assertTrue(dir != null); + + } + + public void testAddDocument() { + Analyzer analyzer = new WhitespaceAnalyzer(); + Similarity similarity = Similarity.getDefault(); + DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50); + assertTrue(writer != null); + try { + writer.addDocument("test", testDoc); + //After adding the document, we should be able to read it back in + SegmentReader reader = new SegmentReader(new SegmentInfo("test", 1, dir)); + assertTrue(reader != null); + Document doc = reader.document(0); + assertTrue(doc != null); + + //System.out.println("Document: " + doc); + Field [] fields = doc.getFields("textField2"); + assertTrue(fields != null && fields.length == 1); + assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_2_TEXT)); + assertTrue(fields[0].isTermVectorStored() == true); + + fields = doc.getFields("textField1"); + assertTrue(fields != null && fields.length == 1); + assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_1_TEXT)); + assertTrue(fields[0].isTermVectorStored() == false); + + fields = doc.getFields("keyField"); + assertTrue(fields != null && fields.length == 1); + assertTrue(fields[0].stringValue().equals(DocHelper.KEYWORD_TEXT)); + } catch (IOException e) { + e.printStackTrace(); + assertTrue(false); + } + } +} Index: 
src/test/org/apache/lucene/index/TestSegmentMerger.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/test/org/apache/lucene/index/TestSegmentMerger.java,v retrieving revision 1.4 diff -u -r1.4 TestSegmentMerger.java --- src/test/org/apache/lucene/index/TestSegmentMerger.java 8 Aug 2004 13:05:33 -0000 1.4 +++ src/test/org/apache/lucene/index/TestSegmentMerger.java 8 Sep 2004 14:33:22 -0000 @@ -109,6 +109,7 @@ int [] freqs = vector.getTermFrequencies(); assertTrue(freqs != null); //System.out.println("Freqs size: " + freqs.length); + assertTrue(vector instanceof TermPositionVector == true); for (int i = 0; i < terms.length; i++) { String term = terms[i]; Index: src/test/org/apache/lucene/index/TestSegmentReader.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/test/org/apache/lucene/index/TestSegmentReader.java,v retrieving revision 1.3 diff -u -r1.3 TestSegmentReader.java --- src/test/org/apache/lucene/index/TestSegmentReader.java 6 Aug 2004 21:32:51 -0000 1.3 +++ src/test/org/apache/lucene/index/TestSegmentReader.java 8 Sep 2004 14:33:22 -0000 @@ -1,199 +1,199 @@ -package org.apache.lucene.index; - -/** - * Copyright 2004 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import junit.framework.TestCase; -import org.apache.lucene.store.RAMDirectory; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; - -import java.io.IOException; -import java.util.Collection; -import java.util.Iterator; -import java.util.Enumeration; - -public class TestSegmentReader extends TestCase { - private RAMDirectory dir = new RAMDirectory(); - private Document testDoc = new Document(); - private SegmentReader reader = null; - - public TestSegmentReader(String s) { - super(s); - } - - //TODO: Setup the reader w/ multiple documents - protected void setUp() { - - try { - DocHelper.setupDoc(testDoc); - DocHelper.writeDoc(dir, testDoc); - reader = new SegmentReader(new SegmentInfo("test", 1, dir)); - } catch (IOException e) { - - } - } - - protected void tearDown() { - - } - - public void test() { - assertTrue(dir != null); - assertTrue(reader != null); - assertTrue(DocHelper.nameValues.size() > 0); - assertTrue(DocHelper.numFields(testDoc) == 6); - } - - public void testDocument() { - try { - assertTrue(reader.numDocs() == 1); - assertTrue(reader.maxDoc() >= 1); - Document result = reader.document(0); - assertTrue(result != null); - //There are 2 unstored fields on the document that are not preserved across writing - assertTrue(DocHelper.numFields(result) == DocHelper.numFields(testDoc) - 2); - - Enumeration fields = result.fields(); - while (fields.hasMoreElements()) { - Field field = (Field) fields.nextElement(); - assertTrue(field != null); - assertTrue(DocHelper.nameValues.containsKey(field.name())); - } - } catch (IOException e) { - e.printStackTrace(); - assertTrue(false); - } - } - - public void testDelete() { - Document docToDelete = new Document(); - DocHelper.setupDoc(docToDelete); - DocHelper.writeDoc(dir, "seg-to-delete", docToDelete); - try { - SegmentReader deleteReader = new SegmentReader(new SegmentInfo("seg-to-delete", 1, dir)); - assertTrue(deleteReader != null); - assertTrue(deleteReader.numDocs() 
== 1); - deleteReader.delete(0); - assertTrue(deleteReader.isDeleted(0) == true); - assertTrue(deleteReader.hasDeletions() == true); - assertTrue(deleteReader.numDocs() == 0); - try { - Document test = deleteReader.document(0); - assertTrue(false); - } catch (IllegalArgumentException e) { - assertTrue(true); - } - } catch (IOException e) { - e.printStackTrace(); - assertTrue(false); - } - } - - public void testGetFieldNameVariations() { - Collection result = reader.getFieldNames(); - assertTrue(result != null); - assertTrue(result.size() == 7); - for (Iterator iter = result.iterator(); iter.hasNext();) { - String s = (String) iter.next(); - //System.out.println("Name: " + s); - assertTrue(DocHelper.nameValues.containsKey(s) == true || s.equals("")); - } - result = reader.getFieldNames(true); - assertTrue(result != null); - assertTrue(result.size() == 5); - for (Iterator iter = result.iterator(); iter.hasNext();) { - String s = (String) iter.next(); - assertTrue(DocHelper.nameValues.containsKey(s) == true || s.equals("")); - } - - result = reader.getFieldNames(false); - assertTrue(result != null); - assertTrue(result.size() == 2); - //Get all indexed fields that are storing term vectors - result = reader.getIndexedFieldNames(true); - assertTrue(result != null); - assertTrue(result.size() == 2); - - result = reader.getIndexedFieldNames(false); - assertTrue(result != null); - assertTrue(result.size() == 3); - } - - public void testTerms() { - try { - TermEnum terms = reader.terms(); - assertTrue(terms != null); - while (terms.next() == true) - { - Term term = terms.term(); - assertTrue(term != null); - //System.out.println("Term: " + term); - String fieldValue = (String)DocHelper.nameValues.get(term.field()); - assertTrue(fieldValue.indexOf(term.text()) != -1); - } - - TermDocs termDocs = reader.termDocs(); - assertTrue(termDocs != null); - termDocs.seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "field")); - assertTrue(termDocs.next() == true); - - TermPositions positions 
= reader.termPositions(); - positions.seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "field")); - assertTrue(positions != null); - assertTrue(positions.doc() == 0); - assertTrue(positions.nextPosition() >= 0); - - } catch (IOException e) { - e.printStackTrace(); - assertTrue(false); - } - } - - public void testNorms() { - //TODO: Not sure how these work/should be tested -/* - try { - byte [] norms = reader.norms(DocHelper.TEXT_FIELD_1_KEY); - System.out.println("Norms: " + norms); - assertTrue(norms != null); - } catch (IOException e) { - e.printStackTrace(); - assertTrue(false); - } -*/ - - } - - public void testTermVectors() { - TermFreqVector result = reader.getTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY); - assertTrue(result != null); - String [] terms = result.getTerms(); - int [] freqs = result.getTermFrequencies(); - assertTrue(terms != null && terms.length == 3 && freqs != null && freqs.length == 3); - for (int i = 0; i < terms.length; i++) { - String term = terms[i]; - int freq = freqs[i]; - assertTrue(DocHelper.FIELD_2_TEXT.indexOf(term) != -1); - assertTrue(freq > 0); - } - - TermFreqVector [] results = reader.getTermFreqVectors(0); - assertTrue(results != null); - assertTrue(results.length == 2); - } - -} +package org.apache.lucene.index; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import junit.framework.TestCase; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; + +import java.io.IOException; +import java.util.Collection; +import java.util.Iterator; +import java.util.Enumeration; + +public class TestSegmentReader extends TestCase { + private RAMDirectory dir = new RAMDirectory(); + private Document testDoc = new Document(); + private SegmentReader reader = null; + + public TestSegmentReader(String s) { + super(s); + } + + //TODO: Setup the reader w/ multiple documents + protected void setUp() { + + try { + DocHelper.setupDoc(testDoc); + DocHelper.writeDoc(dir, testDoc); + reader = new SegmentReader(new SegmentInfo("test", 1, dir)); + } catch (IOException e) { + + } + } + + protected void tearDown() { + + } + + public void test() { + assertTrue(dir != null); + assertTrue(reader != null); + assertTrue(DocHelper.nameValues.size() > 0); + assertTrue(DocHelper.numFields(testDoc) == 6); + } + + public void testDocument() { + try { + assertTrue(reader.numDocs() == 1); + assertTrue(reader.maxDoc() >= 1); + Document result = reader.document(0); + assertTrue(result != null); + //There are 2 unstored fields on the document that are not preserved across writing + assertTrue(DocHelper.numFields(result) == DocHelper.numFields(testDoc) - 2); + + Enumeration fields = result.fields(); + while (fields.hasMoreElements()) { + Field field = (Field) fields.nextElement(); + assertTrue(field != null); + assertTrue(DocHelper.nameValues.containsKey(field.name())); + } + } catch (IOException e) { + e.printStackTrace(); + assertTrue(false); + } + } + + public void testDelete() { + Document docToDelete = new Document(); + DocHelper.setupDoc(docToDelete); + DocHelper.writeDoc(dir, "seg-to-delete", docToDelete); + try { + SegmentReader deleteReader = new SegmentReader(new SegmentInfo("seg-to-delete", 1, dir)); + assertTrue(deleteReader != null); + assertTrue(deleteReader.numDocs() 
== 1); + deleteReader.delete(0); + assertTrue(deleteReader.isDeleted(0) == true); + assertTrue(deleteReader.hasDeletions() == true); + assertTrue(deleteReader.numDocs() == 0); + try { + Document test = deleteReader.document(0); + assertTrue(false); + } catch (IllegalArgumentException e) { + assertTrue(true); + } + } catch (IOException e) { + e.printStackTrace(); + assertTrue(false); + } + } + + public void testGetFieldNameVariations() { + Collection result = reader.getFieldNames(); + assertTrue(result != null); + assertTrue(result.size() == 7); + for (Iterator iter = result.iterator(); iter.hasNext();) { + String s = (String) iter.next(); + //System.out.println("Name: " + s); + assertTrue(DocHelper.nameValues.containsKey(s) == true || s.equals("")); + } + result = reader.getFieldNames(true); + assertTrue(result != null); + assertTrue(result.size() == 5); + for (Iterator iter = result.iterator(); iter.hasNext();) { + String s = (String) iter.next(); + assertTrue(DocHelper.nameValues.containsKey(s) == true || s.equals("")); + } + + result = reader.getFieldNames(false); + assertTrue(result != null); + assertTrue(result.size() == 2); + //Get all indexed fields that are storing term vectors + result = reader.getIndexedFieldNames(true); + assertTrue(result != null); + assertTrue(result.size() == 2); + + result = reader.getIndexedFieldNames(false); + assertTrue(result != null); + assertTrue(result.size() == 3); + } + + public void testTerms() { + try { + TermEnum terms = reader.terms(); + assertTrue(terms != null); + while (terms.next() == true) + { + Term term = terms.term(); + assertTrue(term != null); + //System.out.println("Term: " + term); + String fieldValue = (String)DocHelper.nameValues.get(term.field()); + assertTrue(fieldValue.indexOf(term.text()) != -1); + } + + TermDocs termDocs = reader.termDocs(); + assertTrue(termDocs != null); + termDocs.seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "field")); + assertTrue(termDocs.next() == true); + + TermPositions positions 
= reader.termPositions(); + positions.seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "field")); + assertTrue(positions != null); + assertTrue(positions.doc() == 0); + assertTrue(positions.nextPosition() >= 0); + + } catch (IOException e) { + e.printStackTrace(); + assertTrue(false); + } + } + + public void testNorms() { + //TODO: Not sure how these work/should be tested +/* + try { + byte [] norms = reader.norms(DocHelper.TEXT_FIELD_1_KEY); + System.out.println("Norms: " + norms); + assertTrue(norms != null); + } catch (IOException e) { + e.printStackTrace(); + assertTrue(false); + } +*/ + + } + + public void testTermVectors() { + TermFreqVector result = reader.getTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY); + assertTrue(result != null); + String [] terms = result.getTerms(); + int [] freqs = result.getTermFrequencies(); + assertTrue(terms != null && terms.length == 3 && freqs != null && freqs.length == 3); + for (int i = 0; i < terms.length; i++) { + String term = terms[i]; + int freq = freqs[i]; + assertTrue(DocHelper.FIELD_2_TEXT.indexOf(term) != -1); + assertTrue(freq > 0); + } + + TermFreqVector [] results = reader.getTermFreqVectors(0); + assertTrue(results != null); + assertTrue(results.length == 2); + } + +} Index: src/test/org/apache/lucene/index/TestTermVectorsReader.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/test/org/apache/lucene/index/TestTermVectorsReader.java,v retrieving revision 1.1 diff -u -r1.1 TestTermVectorsReader.java --- src/test/org/apache/lucene/index/TestTermVectorsReader.java 20 Feb 2004 20:14:55 -0000 1.1 +++ src/test/org/apache/lucene/index/TestTermVectorsReader.java 8 Sep 2004 14:33:23 -0000 @@ -1,106 +1,218 @@ -package org.apache.lucene.index; - - -import junit.framework.TestCase; -import org.apache.lucene.store.RAMDirectory; - -import java.io.IOException; -import java.util.Arrays; - -public class TestTermVectorsReader extends TestCase { - private 
TermVectorsWriter writer = null; - //Must be lexicographically sorted, will do in setup, versus trying to maintain here - private String [] testFields = {"f1", "f2", "f3"}; - private String [] testTerms = {"this", "is", "a", "test"}; - private RAMDirectory dir = new RAMDirectory(); - private String seg = "testSegment"; - private FieldInfos fieldInfos = new FieldInfos(); - - public TestTermVectorsReader(String s) { - super(s); - } - - protected void setUp() { - for (int i = 0; i < testFields.length; i++) { - fieldInfos.add(testFields[i], true, true); - } - - try { - Arrays.sort(testTerms); - for (int j = 0; j < 5; j++) { - writer = new TermVectorsWriter(dir, seg, fieldInfos); - writer.openDocument(); - - for (int k = 0; k < testFields.length; k++) { - writer.openField(testFields[k]); - for (int i = 0; i < testTerms.length; i++) { - writer.addTerm(testTerms[i], i); - } - writer.closeField(); - } - writer.closeDocument(); - writer.close(); - } - - } catch (IOException e) { - e.printStackTrace(); - assertTrue(false); - } - } - - protected void tearDown() { - - } - - public void test() { - //Check to see the files were created properly in setup - assertTrue(writer.isDocumentOpen() == false); - assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION)); - assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION)); - } - - public void testReader() { - try { - TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos); - assertTrue(reader != null); - TermFreqVector vector = reader.get(0, testFields[0]); - assertTrue(vector != null); - String [] terms = vector.getTerms(); - assertTrue(terms != null); - assertTrue(terms.length == testTerms.length); - for (int i = 0; i < terms.length; i++) { - String term = terms[i]; - //System.out.println("Term: " + term); - assertTrue(term.equals(testTerms[i])); - } - - } catch (IOException e) { - e.printStackTrace(); - assertTrue(false); - } - } - - /** - * Make sure exceptions and bad params are handled 
appropriately - */ - public void testBadParams() { - try { - TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos); - assertTrue(reader != null); - //Bad document number, good field number - TermFreqVector vector = reader.get(50, testFields[0]); - assertTrue(vector == null); - } catch (Exception e) { - assertTrue(false); - } - try { - TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos); - assertTrue(reader != null); - //good document number, bad field number - TermFreqVector vector = reader.get(0, "f50"); - assertTrue(vector == null); - } catch (Exception e) { - assertTrue(false); - } - } -} +package org.apache.lucene.index; + + +import junit.framework.TestCase; +import org.apache.lucene.store.RAMDirectory; + +import java.io.IOException; +import java.util.Arrays; + +public class TestTermVectorsReader extends TestCase { + private TermVectorsWriter writer = null; + //Must be lexicographically sorted, will do in setup, versus trying to maintain here + private String [] testFields = {"f1", "f2", "f3"}; + private boolean [] testFieldsStorePos = {true, false, true, false}; + private boolean [] testFieldsStoreOff = {true, false, false, true}; + private String [] testTerms = {"this", "is", "a", "test"}; + private int [][] positions = new int[testTerms.length][]; + private TermVectorOffsetInfo [][] offsets = new TermVectorOffsetInfo[testTerms.length][]; + private RAMDirectory dir = new RAMDirectory(); + private String seg = "testSegment"; + private FieldInfos fieldInfos = new FieldInfos(); + + public TestTermVectorsReader(String s) { + super(s); + } + + protected void setUp() { + for (int i = 0; i < testFields.length; i++) { + fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]); + } + + for (int i = 0; i < testTerms.length; i++) + { + positions[i] = new int[3]; + for (int j = 0; j < positions[i].length; j++) { + positions[i][j] = (int)(Math.random() * 1000); + } + offsets[i] = new 
TermVectorOffsetInfo[3]; + for (int j = 0; j < offsets[i].length; j++){ + offsets[i][j] = new TermVectorOffsetInfo(0, testTerms[i].length()); + } + } + try { + Arrays.sort(testTerms); + for (int j = 0; j < 5; j++) { + writer = new TermVectorsWriter(dir, seg, fieldInfos); + writer.openDocument(); + + for (int k = 0; k < testFields.length; k++) { + writer.openField(testFields[k]); + for (int i = 0; i < testTerms.length; i++) { + writer.addTerm(testTerms[i], 3, positions[i], offsets[i]); + } + writer.closeField(); + } + writer.closeDocument(); + writer.close(); + } + + } catch (IOException e) { + e.printStackTrace(); + assertTrue(false); + } + } + + protected void tearDown() { + + } + + public void test() { + //Check to see the files were created properly in setup + assertTrue(writer.isDocumentOpen() == false); + assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION)); + assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION)); + } + + public void testReader() { + try { + TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos); + assertTrue(reader != null); + TermFreqVector vector = reader.get(0, testFields[0]); + assertTrue(vector != null); + String [] terms = vector.getTerms(); + assertTrue(terms != null); + assertTrue(terms.length == testTerms.length); + for (int i = 0; i < terms.length; i++) { + String term = terms[i]; + //System.out.println("Term: " + term); + assertTrue(term.equals(testTerms[i])); + } + + } catch (IOException e) { + e.printStackTrace(); + assertTrue(false); + } + } + + public void testPositionReader() { + try { + TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos); + assertTrue(reader != null); + TermPositionVector vector; + String [] terms; + vector = (TermPositionVector)reader.get(0, testFields[0]); + assertTrue(vector != null); + terms = vector.getTerms(); + assertTrue(terms != null); + assertTrue(terms.length == testTerms.length); + for (int i = 0; i < terms.length; i++) { + String term = 
terms[i]; + //System.out.println("Term: " + term); + assertTrue(term.equals(testTerms[i])); + int [] positions = vector.getTermPositions(i); + assertTrue(positions != null); + assertTrue(positions.length == this.positions[i].length); + for (int j = 0; j < positions.length; j++) { + int position = positions[j]; + assertTrue(position == this.positions[i][j]); + } + TermVectorOffsetInfo [] offset = vector.getOffsets(i); + assertTrue(offset != null); + assertTrue(offset.length == this.offsets[i].length); + for (int j = 0; j < offset.length; j++) { + TermVectorOffsetInfo termVectorOffsetInfo = offset[j]; + assertTrue(termVectorOffsetInfo.equals(offsets[i][j])); + } + } + + TermFreqVector freqVector = (TermFreqVector)reader.get(0, testFields[1]); //no pos, no offset + assertTrue(freqVector != null); + assertTrue(freqVector instanceof TermPositionVector == false); + terms = freqVector.getTerms(); + assertTrue(terms != null); + assertTrue(terms.length == testTerms.length); + for (int i = 0; i < terms.length; i++) { + String term = terms[i]; + //System.out.println("Term: " + term); + assertTrue(term.equals(testTerms[i])); + } + + + } catch (IOException e) { + e.printStackTrace(); + assertTrue(false); + } + catch (ClassCastException cce) + { + cce.printStackTrace(); + assertTrue(false); + } + } + + public void testOffsetReader() { + try { + TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos); + assertTrue(reader != null); + TermPositionVector vector = (TermPositionVector)reader.get(0, testFields[0]); + assertTrue(vector != null); + String [] terms = vector.getTerms(); + assertTrue(terms != null); + assertTrue(terms.length == testTerms.length); + for (int i = 0; i < terms.length; i++) { + String term = terms[i]; + //System.out.println("Term: " + term); + assertTrue(term.equals(testTerms[i])); + int [] positions = vector.getTermPositions(i); + assertTrue(positions != null); + assertTrue(positions.length == this.positions[i].length); + for (int j = 0; j < 
positions.length; j++) { + int position = positions[j]; + assertTrue(position == this.positions[i][j]); + } + TermVectorOffsetInfo [] offset = vector.getOffsets(i); + assertTrue(offset != null); + assertTrue(offset.length == this.offsets[i].length); + for (int j = 0; j < offset.length; j++) { + TermVectorOffsetInfo termVectorOffsetInfo = offset[j]; + assertTrue(termVectorOffsetInfo.equals(offsets[i][j])); + } + } + + + } catch (IOException e) { + e.printStackTrace(); + assertTrue(false); + } + catch (ClassCastException cce) + { + cce.printStackTrace(); + assertTrue(false); + } + } + + + /** + * Make sure exceptions and bad params are handled appropriately + */ + public void testBadParams() { + try { + TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos); + assertTrue(reader != null); + //Bad document number, good field number + TermFreqVector vector = reader.get(50, testFields[0]); + assertTrue(vector == null); + } catch (Exception e) { + assertTrue(false); + } + try { + TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos); + assertTrue(reader != null); + //good document number, bad field number + TermFreqVector vector = reader.get(0, "f50"); + assertTrue(vector == null); + } catch (Exception e) { + assertTrue(false); + } + } +} cvs server: Diffing src/test/org/apache/lucene/index/store cvs server: Diffing src/test/org/apache/lucene/queryParser cvs server: Diffing src/test/org/apache/lucene/search Index: src/test/org/apache/lucene/search/TestTermVectors.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/test/org/apache/lucene/search/TestTermVectors.java,v retrieving revision 1.4 diff -u -r1.4 TestTermVectors.java --- src/test/org/apache/lucene/search/TestTermVectors.java 7 Sep 2004 18:26:36 -0000 1.4 +++ src/test/org/apache/lucene/search/TestTermVectors.java 8 Sep 2004 14:33:23 -0000 @@ -1,222 +1,298 @@ -package org.apache.lucene.search; - -/** - * Copyright 2004 
The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import junit.framework.TestCase; -import org.apache.lucene.analysis.SimpleAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.index.*; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.RAMDirectory; -import org.apache.lucene.util.English; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; - -public class TestTermVectors extends TestCase { - private IndexSearcher searcher; - private RAMDirectory directory = new RAMDirectory(); - public TestTermVectors(String s) { - super(s); - } - - public void setUp() throws Exception { - IndexWriter writer - = new IndexWriter(directory, new SimpleAnalyzer(), true); - //writer.setUseCompoundFile(true); - //writer.infoStream = System.out; - for (int i = 0; i < 1000; i++) { - Document doc = new Document(); - doc.add(new Field("field", English.intToEnglish(i), - Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES)); - writer.addDocument(doc); - } - writer.close(); - searcher = new IndexSearcher(directory); - } - - protected void tearDown() { - - } - - public void test() { - assertTrue(searcher != null); - } - - public void testTermVectors() { - Query query = new TermQuery(new Term("field", "seventy")); - try { - Hits hits = searcher.search(query); - assertEquals(100, hits.length()); - - for (int i = 0; i < 
hits.length(); i++) - { - TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i)); - assertTrue(vector != null); - assertTrue(vector.length == 1); - //assertTrue(); - } - TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(50)); - //System.out.println("Explain: " + searcher.explain(query, hits.id(50))); - //System.out.println("Vector: " + vector[0].toString()); - } catch (IOException e) { - assertTrue(false); - } - } - - public void testTermPositionVectors() { - Query query = new TermQuery(new Term("field", "fifty")); - try { - Hits hits = searcher.search(query); - assertEquals(100, hits.length()); - - for (int i = 0; i < hits.length(); i++) - { - TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i)); - assertTrue(vector != null); - assertTrue(vector.length == 1); - //assertTrue(); - } - } catch (IOException e) { - assertTrue(false); - } - } - - public void testKnownSetOfDocuments() { - String test1 = "eating chocolate in a computer lab"; //6 terms - String test2 = "computer in a computer lab"; //5 terms - String test3 = "a chocolate lab grows old"; //5 terms - String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms - Map test4Map = new HashMap(); - test4Map.put("chocolate", new Integer(3)); - test4Map.put("lab", new Integer(2)); - test4Map.put("eating", new Integer(1)); - test4Map.put("computer", new Integer(1)); - test4Map.put("with", new Integer(1)); - test4Map.put("a", new Integer(1)); - test4Map.put("colored", new Integer(1)); - test4Map.put("in", new Integer(1)); - test4Map.put("an", new Integer(1)); - test4Map.put("computer", new Integer(1)); - test4Map.put("old", new Integer(1)); - - Document testDoc1 = new Document(); - setupDoc(testDoc1, test1); - Document testDoc2 = new Document(); - setupDoc(testDoc2, test2); - Document testDoc3 = new Document(); - setupDoc(testDoc3, test3); - Document testDoc4 = new Document(); - setupDoc(testDoc4, test4); - - 
Directory dir = new RAMDirectory(); - - try { - IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true); - assertTrue(writer != null); - writer.addDocument(testDoc1); - writer.addDocument(testDoc2); - writer.addDocument(testDoc3); - writer.addDocument(testDoc4); - writer.close(); - IndexSearcher knownSearcher = new IndexSearcher(dir); - TermEnum termEnum = knownSearcher.reader.terms(); - TermDocs termDocs = knownSearcher.reader.termDocs(); - //System.out.println("Terms: " + termEnum.size() + " Orig Len: " + termArray.length); - - Similarity sim = knownSearcher.getSimilarity(); - while (termEnum.next() == true) - { - Term term = termEnum.term(); - //System.out.println("Term: " + term); - termDocs.seek(term); - while (termDocs.next()) - { - int docId = termDocs.doc(); - int freq = termDocs.freq(); - //System.out.println("Doc Id: " + docId + " freq " + freq); - TermFreqVector vector = knownSearcher.reader.getTermFreqVector(docId, "field"); - float tf = sim.tf(freq); - float idf = sim.idf(term, knownSearcher); - //float qNorm = sim.queryNorm() - //This is fine since we don't have stop words - float lNorm = sim.lengthNorm("field", vector.getTerms().length); - //float coord = sim.coord() - //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm); - assertTrue(vector != null); - String[] vTerms = vector.getTerms(); - int [] freqs = vector.getTermFrequencies(); - for (int i = 0; i < vTerms.length; i++) - { - if (term.text().equals(vTerms[i]) == true) - { - assertTrue(freqs[i] == freq); - } - } - - } - //System.out.println("--------"); - } - Query query = new TermQuery(new Term("field", "chocolate")); - Hits hits = knownSearcher.search(query); - //doc 3 should be the first hit b/c it is the shortest match - assertTrue(hits.length() == 3); - float score = hits.score(0); - /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString()); - System.out.println("Explain: " + 
knownSearcher.explain(query, hits.id(0))); - System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString()); - System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1))); - System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString()); - System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/ - assertTrue(testDoc3.toString().equals(hits.doc(0).toString())); - assertTrue(testDoc4.toString().equals(hits.doc(1).toString())); - assertTrue(testDoc1.toString().equals(hits.doc(2).toString())); - TermFreqVector vector = knownSearcher.reader.getTermFreqVector(hits.id(1), "field"); - assertTrue(vector != null); - //System.out.println("Vector: " + vector); - String[] terms = vector.getTerms(); - int [] freqs = vector.getTermFrequencies(); - assertTrue(terms != null && terms.length == 10); - for (int i = 0; i < terms.length; i++) { - String term = terms[i]; - //System.out.println("Term: " + term); - int freq = freqs[i]; - assertTrue(test4.indexOf(term) != -1); - Integer freqInt = (Integer)test4Map.get(term); - assertTrue(freqInt != null); - assertTrue(freqInt.intValue() == freq); - } - knownSearcher.close(); - } catch (IOException e) { - e.printStackTrace(); - assertTrue(false); - } - - - } - - private void setupDoc(Document doc, String text) - { - doc.add(new Field("field", text, Field.Store.YES, - Field.Index.TOKENIZED, Field.TermVector.YES)); - //System.out.println("Document: " + doc); - } - - -} +package org.apache.lucene.search; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import junit.framework.TestCase; +import org.apache.lucene.analysis.SimpleAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.*; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.English; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +public class TestTermVectors extends TestCase { + private IndexSearcher searcher; + private RAMDirectory directory = new RAMDirectory(); + public TestTermVectors(String s) { + super(s); + } + + public void setUp() throws Exception { + IndexWriter writer + = new IndexWriter(directory, new SimpleAnalyzer(), true); + //writer.setUseCompoundFile(true); + //writer.infoStream = System.out; + for (int i = 0; i < 1000; i++) { + Document doc = new Document(); + Field.TermVector termVector; + int mod3 = i % 3; + int mod2 = i % 2; + if (mod2 == 0 && mod3 == 0){ + termVector = Field.TermVector.WITH_POSITIONS_OFFSETS; + } + else if (mod2 == 0){ + termVector = Field.TermVector.WITH_POSITIONS; + } + else if (mod3 == 0){ + termVector = Field.TermVector.WITH_OFFSETS; + } + else { + termVector = Field.TermVector.YES; + } + doc.add(new Field("field", English.intToEnglish(i), + Field.Store.YES, Field.Index.TOKENIZED, termVector)); + writer.addDocument(doc); + } + writer.close(); + searcher = new IndexSearcher(directory); + } + + protected void tearDown() { + + } + + public void test() { + assertTrue(searcher != null); + } + + public void 
testTermVectors() { + Query query = new TermQuery(new Term("field", "seventy")); + try { + Hits hits = searcher.search(query); + assertEquals(100, hits.length()); + + for (int i = 0; i < hits.length(); i++) + { + TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i)); + assertTrue(vector != null); + assertTrue(vector.length == 1); + //assertTrue(); + } + TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(50)); + //System.out.println("Explain: " + searcher.explain(query, hits.id(50))); + //System.out.println("Vector: " + vector[0].toString()); + } catch (IOException e) { + assertTrue(false); + } + } + + public void testTermPositionVectors() { + Query query = new TermQuery(new Term("field", "zero")); + try { + Hits hits = searcher.search(query); + assertEquals(1, hits.length()); + + for (int i = 0; i < hits.length(); i++) + { + TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i)); + assertTrue(vector != null); + assertTrue(vector.length == 1); + boolean shouldBePosVector = (hits.id(i) % 2 == 0) ? true : false; + assertTrue((shouldBePosVector == false) || (shouldBePosVector == true && (vector[0] instanceof TermPositionVector == true))); + if (shouldBePosVector == true) + { + TermPositionVector posVec = (TermPositionVector)vector[0]; + String [] terms = posVec.getTerms(); + assertTrue(terms != null && terms.length > 0); + for (int j = 0; j < terms.length; j++) { + int [] positions = posVec.getTermPositions(j); + assertTrue(positions != null); + assertTrue(positions.length > 0); + } + } + boolean shouldBeOffVector = (hits.id(i) % 3 == 0) ? 
true : false; + if (shouldBeOffVector == true) + { + TermPositionVector posVec = (TermPositionVector)vector[0]; + String [] terms = posVec.getTerms(); + assertTrue(terms != null && terms.length > 0); + for (int j = 0; j < terms.length; j++) { + String term = terms[j]; + TermVectorOffsetInfo [] offsets = posVec.getOffsets(j); + assertTrue(offsets != null); + assertTrue(offsets.length > 0); + } + } + boolean shouldBeBothVector = (hits.id(i) % 6 == 0) ? true : false; + //System.out.println("Hit Id: " + hits.id(i)); + if (shouldBeBothVector == true) + { + TermPositionVector posVec = (TermPositionVector)vector[0]; + String [] terms = posVec.getTerms(); + assertTrue(terms != null && terms.length > 0); + for (int j = 0; j < terms.length; j++) { + TermVectorOffsetInfo [] offsets = posVec.getOffsets(j); + assertTrue(offsets != null); + assertTrue(offsets.length > 0); + int [] positions = posVec.getTermPositions(j); + assertTrue(positions != null); + assertTrue(positions.length > 0); + } + } + //assertTrue(); + } + } catch (IOException e) { + assertTrue(false); + } + } + + public void testTermOffsetVectors() { + Query query = new TermQuery(new Term("field", "fifty")); + try { + Hits hits = searcher.search(query); + assertEquals(100, hits.length()); + + for (int i = 0; i < hits.length(); i++) + { + TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i)); + assertTrue(vector != null); + assertTrue(vector.length == 1); + + //assertTrue(); + } + } catch (IOException e) { + assertTrue(false); + } + } + + public void testKnownSetOfDocuments() { + String test1 = "eating chocolate in a computer lab"; //6 terms + String test2 = "computer in a computer lab"; //5 terms + String test3 = "a chocolate lab grows old"; //5 terms + String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms + Map test4Map = new HashMap(); + test4Map.put("chocolate", new Integer(3)); + test4Map.put("lab", new Integer(2)); + 
test4Map.put("eating", new Integer(1)); + test4Map.put("computer", new Integer(1)); + test4Map.put("with", new Integer(1)); + test4Map.put("a", new Integer(1)); + test4Map.put("colored", new Integer(1)); + test4Map.put("in", new Integer(1)); + test4Map.put("an", new Integer(1)); + test4Map.put("computer", new Integer(1)); + test4Map.put("old", new Integer(1)); + + Document testDoc1 = new Document(); + setupDoc(testDoc1, test1); + Document testDoc2 = new Document(); + setupDoc(testDoc2, test2); + Document testDoc3 = new Document(); + setupDoc(testDoc3, test3); + Document testDoc4 = new Document(); + setupDoc(testDoc4, test4); + + Directory dir = new RAMDirectory(); + + try { + IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true); + assertTrue(writer != null); + writer.addDocument(testDoc1); + writer.addDocument(testDoc2); + writer.addDocument(testDoc3); + writer.addDocument(testDoc4); + writer.close(); + IndexSearcher knownSearcher = new IndexSearcher(dir); + TermEnum termEnum = knownSearcher.reader.terms(); + TermDocs termDocs = knownSearcher.reader.termDocs(); + //System.out.println("Terms: " + termEnum.size() + " Orig Len: " + termArray.length); + + Similarity sim = knownSearcher.getSimilarity(); + while (termEnum.next() == true) + { + Term term = termEnum.term(); + //System.out.println("Term: " + term); + termDocs.seek(term); + while (termDocs.next()) + { + int docId = termDocs.doc(); + int freq = termDocs.freq(); + //System.out.println("Doc Id: " + docId + " freq " + freq); + TermFreqVector vector = knownSearcher.reader.getTermFreqVector(docId, "field"); + float tf = sim.tf(freq); + float idf = sim.idf(term, knownSearcher); + //float qNorm = sim.queryNorm() + //This is fine since we don't have stop words + float lNorm = sim.lengthNorm("field", vector.getTerms().length); + //float coord = sim.coord() + //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm); + assertTrue(vector != null); + String[] vTerms = vector.getTerms(); + 
int [] freqs = vector.getTermFrequencies(); + for (int i = 0; i < vTerms.length; i++) + { + if (term.text().equals(vTerms[i]) == true) + { + assertTrue(freqs[i] == freq); + } + } + + } + //System.out.println("--------"); + } + Query query = new TermQuery(new Term("field", "chocolate")); + Hits hits = knownSearcher.search(query); + //doc 3 should be the first hit b/c it is the shortest match + assertTrue(hits.length() == 3); + float score = hits.score(0); + /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString()); + System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0))); + System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString()); + System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1))); + System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString()); + System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/ + assertTrue(hits.id(0) == 2); + assertTrue(hits.id(1) == 3); + assertTrue(hits.id(2) == 0); + TermFreqVector vector = knownSearcher.reader.getTermFreqVector(hits.id(1), "field"); + assertTrue(vector != null); + //System.out.println("Vector: " + vector); + String[] terms = vector.getTerms(); + int [] freqs = vector.getTermFrequencies(); + assertTrue(terms != null && terms.length == 10); + for (int i = 0; i < terms.length; i++) { + String term = terms[i]; + //System.out.println("Term: " + term); + int freq = freqs[i]; + assertTrue(test4.indexOf(term) != -1); + Integer freqInt = (Integer)test4Map.get(term); + assertTrue(freqInt != null); + assertTrue(freqInt.intValue() == freq); + } + knownSearcher.close(); + } catch (IOException e) { + e.printStackTrace(); + assertTrue(false); + } + + + } + + private void setupDoc(Document doc, String text) + { + doc.add(new Field("field", text, Field.Store.YES, + Field.Index.TOKENIZED, Field.TermVector.YES)); + 
//System.out.println("Document: " + doc); + } + + +} cvs server: Diffing src/test/org/apache/lucene/search/spans cvs server: Diffing src/test/org/apache/lucene/store cvs server: Diffing src/test/org/apache/lucene/util cvs server: Diffing xdocs cvs server: Diffing xdocs/images cvs server: Diffing xdocs/lucene-sandbox cvs server: Diffing xdocs/lucene-sandbox/larm cvs server: Diffing xdocs/stylesheets