Index: src/test/org/apache/lucene/index/TestFieldsReader.java
===================================================================
--- src/test/org/apache/lucene/index/TestFieldsReader.java (revision 382277)
+++ src/test/org/apache/lucene/index/TestFieldsReader.java (working copy)
@@ -34,9 +34,22 @@
super(s);
}
+ public static final String FIELD_UTF1_TEXT = "field one \u4e00text";
+ public static final String TEXT_FIELD_UTF1_KEY = "textField1Utf8";
+ public static Field textUtfField1 = new Field(TEXT_FIELD_UTF1_KEY, FIELD_UTF1_TEXT,
+ Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO);
+
+ public static final String FIELD_UTF2_TEXT = "field field field \u4e00two text";
+ //Terms will be lexicographically sorted. So, the order is: field, text, <U+4E00>two
+ public static final int [] FIELD_UTF2_FREQS = {3, 1, 1};
+ public static final String TEXT_FIELD_UTF2_KEY = "textField2Utf8";
+ public static Field textUtfField2 = new Field(TEXT_FIELD_UTF2_KEY, FIELD_UTF2_TEXT, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+
protected void setUp() throws IOException {
fieldInfos = new FieldInfos();
DocHelper.setupDoc(testDoc);
+ testDoc.add(textUtfField1);
+ testDoc.add(textUtfField2);
fieldInfos.add(testDoc);
DocumentWriter writer = new DocumentWriter(dir, new WhitespaceAnalyzer(),
Similarity.getDefault(), 50);
@@ -69,6 +82,93 @@
assertTrue(field.isStorePositionWithTermVector() == false);
assertTrue(field.getOmitNorms() == true);
+ reader.close();
+ }
+
+  public void testUtf8() throws IOException {
+    // Reads the whole document back and checks both non-ASCII stored fields.
+    assertNotNull(dir);
+    assertNotNull(fieldInfos);
+    FieldsReader storedFields = new FieldsReader(dir, "test", fieldInfos);
+    assertNotNull(storedFields);
+    assertEquals(1, storedFields.size());
+    Document loaded = storedFields.doc(0);
+    assertNotNull(loaded);
+    assertNotNull(loaded.getField("textField1"));
+
+    // Field declared with TermVector.NO: only the stored text is checked.
+    Field utf1 = loaded.getField(TEXT_FIELD_UTF1_KEY);
+    assertNotNull(utf1);
+    assertEquals(FIELD_UTF1_TEXT, utf1.stringValue());
+
+    // Field declared WITH_POSITIONS_OFFSETS: flags and text are both checked.
+    Field utf2 = loaded.getField(TEXT_FIELD_UTF2_KEY);
+    assertNotNull(utf2);
+    assertTrue(utf2.isTermVectorStored());
+    assertTrue(utf2.isStoreOffsetWithTermVector());
+    assertTrue(utf2.isStorePositionWithTermVector());
+    assertFalse(utf2.getOmitNorms());
+    assertEquals(FIELD_UTF2_TEXT, utf2.stringValue());
+    storedFields.close();
+  }
+
+  public void testDocField() throws IOException {
+    // Single-field lookups through docField are compared with the DocHelper texts.
+    assertNotNull(dir);
+    assertNotNull(fieldInfos);
+    FieldsReader storedFields = new FieldsReader(dir, "test", fieldInfos);
+    assertNotNull(storedFields);
+    assertEquals(1, storedFields.size());
+
+    Field first = storedFields.docField(0, "textField1");
+    assertNotNull(first);
+    assertEquals(DocHelper.FIELD_1_TEXT, first.stringValue());
+
+    Field second = storedFields.docField(0, "textField2");
+    assertNotNull(second);
+    assertTrue(second.isTermVectorStored());
+    assertTrue(second.isStoreOffsetWithTermVector());
+    assertTrue(second.isStorePositionWithTermVector());
+    assertFalse(second.getOmitNorms());
+    assertEquals(DocHelper.FIELD_2_TEXT, second.stringValue());
+
+    Field third = storedFields.docField(0, "textField3");
+    assertNotNull(third);
+    assertFalse(third.isTermVectorStored());
+    assertFalse(third.isStoreOffsetWithTermVector());
+    assertFalse(third.isStorePositionWithTermVector());
+    assertTrue(third.getOmitNorms());
+    assertEquals(DocHelper.FIELD_3_TEXT, third.stringValue());
+
+    // A field name docField cannot resolve must yield null.
+    assertNull(storedFields.docField(0, "doesnotexist"));
+    storedFields.close();
+  }
+ public void testDocFieldUtf8() throws IOException {
+ assertTrue(dir != null);
+ assertTrue(fieldInfos != null);
+ FieldsReader reader = new FieldsReader(dir, "test", fieldInfos);
+ assertTrue(reader != null);
+ assertTrue(reader.size() == 1);
+ // Sanity-check the fixtures first: each UTF text is expected to be one char, and
+ // three UTF-8 bytes, longer than the DocHelper text it is compared with.
+ assertEquals(FIELD_UTF1_TEXT.length(), DocHelper.FIELD_1_TEXT.length() + 1);
+ assertEquals(FIELD_UTF1_TEXT.getBytes("UTF-8").length, DocHelper.FIELD_1_TEXT.getBytes("UTF-8").length + 3);
+ assertEquals(FIELD_UTF2_TEXT.length(), DocHelper.FIELD_2_TEXT.length() + 1);
+ assertEquals(FIELD_UTF2_TEXT.getBytes("UTF-8").length, DocHelper.FIELD_2_TEXT.getBytes("UTF-8").length + 3);
+ // The field declared with TermVector.NO: docField must return the stored text unchanged.
+ Field field = reader.docField(0,TEXT_FIELD_UTF1_KEY);
+ assertTrue(field != null);
+ assertEquals(FIELD_UTF1_TEXT, field.stringValue());
+ // The field declared WITH_POSITIONS_OFFSETS: check its flags as well as its text.
+ field = reader.docField(0,TEXT_FIELD_UTF2_KEY);
+ assertTrue(field != null);
+ assertTrue(field.isTermVectorStored() == true);
+
+ assertTrue(field.isStoreOffsetWithTermVector() == true);
+ assertTrue(field.isStorePositionWithTermVector() == true);
+ assertTrue(field.getOmitNorms() == false);
+ assertEquals(FIELD_UTF2_TEXT, field.stringValue());
reader.close();
}
Index: src/java/org/apache/lucene/index/MultiReader.java
===================================================================
--- src/java/org/apache/lucene/index/MultiReader.java (revision 382277)
+++ src/java/org/apache/lucene/index/MultiReader.java (working copy)
@@ -104,6 +104,16 @@
return subReaders[i].document(n - starts[i]); // dispatch to segment reader
}
+  public Field getDocField(int n, String field) throws IOException {
+    final int segment = readerIndex(n);        // sub-reader that holds document n
+    return subReaders[segment].getDocField(n - starts[segment], field); // n rebased to that sub-reader
+  }
+
+  public String getDocFieldString(int n, String field) throws IOException {
+    final int segment = readerIndex(n);        // sub-reader that holds document n
+    return subReaders[segment].getDocFieldString(n - starts[segment], field); // n rebased to that sub-reader
+  }
+
public boolean isDeleted(int n) {
int i = readerIndex(n); // find segment num
return subReaders[i].isDeleted(n - starts[i]); // dispatch to segment reader
Index: src/java/org/apache/lucene/index/FieldsReader.java
===================================================================
--- src/java/org/apache/lucene/index/FieldsReader.java (revision 382277)
+++ src/java/org/apache/lucene/index/FieldsReader.java (working copy)
@@ -57,6 +57,11 @@
return size;
}
+ /**
+ * Retrieve a Document that contains all of the stored fields of document n
+ * @param n the document number
+ * @return the Document built from the stored fields read for document n
+ */
final Document doc(int n) throws IOException {
indexStream.seek(n * 8L);
long position = indexStream.readLong();
@@ -67,9 +72,22 @@
for (int i = 0; i < numFields; i++) {
int fieldNumber = fieldsStream.readVInt();
FieldInfo fi = fieldInfos.fieldInfo(fieldNumber);
+ doc.add(getFieldFromStream(fi));
+ }
- byte bits = fieldsStream.readByte();
+ return doc;
+ }
+ /**
+ * Retrieve a particular field from fieldsStream, which is currently
+ * positioned at the bits byte of a field. When finished, fieldsStream
+ * will be positioned at the fieldNum of the next stored field.
+ * @param fi the FieldInfo for the field being examined (based on the
+ * previous vint in the stream)
+ * @return the Field.
+ */
+ private Field getFieldFromStream(FieldInfo fi) throws IOException {
+ byte bits = fieldsStream.readByte();
boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
@@ -77,9 +95,9 @@
final byte[] b = new byte[fieldsStream.readVInt()];
fieldsStream.readBytes(b, 0, b.length);
if (compressed)
- doc.add(new Field(fi.name, uncompress(b), Field.Store.COMPRESS));
+ return new Field(fi.name, uncompress(b), Field.Store.COMPRESS);
else
- doc.add(new Field(fi.name, b, Field.Store.YES));
+ return new Field(fi.name, b, Field.Store.YES);
}
else {
Field.Index index;
@@ -123,7 +141,7 @@
index,
termVector);
f.setOmitNorms(fi.omitNorms);
- doc.add(f);
+ return f;
}
else {
Field f = new Field(fi.name, // name
@@ -132,13 +150,126 @@
index,
termVector);
f.setOmitNorms(fi.omitNorms);
- doc.add(f);
+ return f;
}
}
}
- return doc;
+
+ /**
+ * Retrieve the first stored field in the given document with the
+ * specified field name. If there are more than one field
+ * in the document with that field name, only the first one
+ * is returned.
+ * @param n the document to retrieve
+ * @param fieldName the name of the field to retrieve
+ * @return the first field in the document with that name, or null
+ * if the document doesn't have such a field stored.
+ * @see Document#getField(String)
+ * @throws IOException if reading from the index or fields stream fails
+ */
+ final Field docField(int n, String fieldName) throws IOException {
+ int fieldNo = fieldInfos.fieldNumber(fieldName);
+ if (fieldNo < 0) return null; // name not known to fieldInfos: nothing to look for
+
+ // Look up the document's start position in the fields stream (one long per document)
+ indexStream.seek(n * 8L);
+ long position = indexStream.readLong();
+ fieldsStream.seek(position);
+
+ int numFields = fieldsStream.readVInt();
+
+ for (int i = 0; i < numFields; i++) {
+ int fieldNumber = fieldsStream.readVInt();
+
+ if (fieldNumber == fieldNo) {
+ // This is the field we want; the stream is at its bits byte, as getFieldFromStream expects
+ FieldInfo fi = fieldInfos.fieldInfo(fieldNumber);
+ return getFieldFromStream(fi);
+ } else {
+ // This is a field we want to skip without building a Field for it
+ byte bits = fieldsStream.readByte(); // The bits
+ boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
+ boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0;
+
+ int dataLength = fieldsStream.readVInt(); // Length to skip
+
+ if (compressed || binary) {
+ fieldsStream.seek(fieldsStream.getFilePointer() + dataLength); // length is in bytes
+ } else {
+ // If not compressed or binary, we store the number of chars, not number of bytes.
+ for (int j = 0; j < dataLength; j++) {
+ fieldsStream.readChar(); // decoded only to advance past the char
+ }
+ }
+ continue;
+ }
+
}
+ // No stored field with that name was found in the document.
+ return null;
+ }
+
+  /**
+   * Retrieve the string value of the first stored field with the given
+   * name in the given document, without building the other fields.
+   * If more than one field in the document has that name, only the first
+   * one is used.
+   * @param n the document to read
+   * @param fieldName the name of the field to retrieve
+   * @return the string value of the first field in the document with that
+   *   name, or null if the name is unknown, the document has no such field
+   *   stored, or the stored value is binary.
+   * @see Document#getField(String)
+   * @throws IOException if reading from the index or fields stream fails
+   */
+  final String docFieldString(int n, String fieldName) throws IOException {
+    final int fieldNo = fieldInfos.fieldNumber(fieldName);
+    if (fieldNo < 0) return null;
+
+    // The index stream is read as one long per document: the offset of its fields.
+    indexStream.seek(n * 8L);
+    fieldsStream.seek(indexStream.readLong());
+
+    final int numFields = fieldsStream.readVInt();
+    for (int i = 0; i < numFields; i++) {
+      final int fieldNumber = fieldsStream.readVInt();
+      final byte bits = fieldsStream.readByte();
+      final boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
+      final boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0;
+
+      // The length prefix is consumed here for every field, so the value must be
+      // read (or skipped) using this length; calling readString() afterwards
+      // would read a second length out of the value's own bytes.
+      final int dataLength = fieldsStream.readVInt();
+
+      if (fieldNumber != fieldNo) {
+        if (compressed || binary) {
+          // Length counts bytes: jump straight over the value.
+          fieldsStream.seek(fieldsStream.getFilePointer() + dataLength);
+        } else {
+          // Length counts chars, not bytes: decode each char to advance.
+          for (int j = 0; j < dataLength; j++) {
+            fieldsStream.readChar();
+          }
+        }
+        continue;
+      }
+
+      if (binary) return null; // Don't return binary
+      if (compressed) {
+        final byte[] b = new byte[dataLength];
+        fieldsStream.readBytes(b, 0, b.length);
+        return new String(uncompress(b), "UTF-8"); // stored compressed as UTF-8 bytes
+      }
+      final char[] chars = new char[dataLength];
+      fieldsStream.readChars(chars, 0, dataLength);
+      return new String(chars);
+    }
+    // The document has no stored field with that name.
+    return null;
+  }
+
private final byte[] uncompress(final byte[] input)
throws IOException
Index: src/java/org/apache/lucene/index/IndexReader.java
===================================================================
--- src/java/org/apache/lucene/index/IndexReader.java (revision 382277)
+++ src/java/org/apache/lucene/index/IndexReader.java (working copy)
@@ -357,6 +357,36 @@
Document in this index. */
public abstract Document document(int n) throws IOException;
+  /**
+   * Returns the first field named <code>field</code> of document
+   * <code>doc</code>, like {@link Document#getField(String)} does.
+   *
+   * This default implementation loads the whole document first; subclasses
+   * may override it to read only the requested field. Prefer it when this
+   * is the only field you want from the document and the field is known to
+   * have a single value (like a document id).
+   */
+  public Field getDocField(int doc, String field) throws IOException {
+    final Document stored = document(doc);
+    return stored.getField(field);
+  }
+
+  /**
+   * Returns the string value of the first field named <code>field</code> of
+   * document <code>doc</code>, or null if the document has no such field.
+   *
+   * This default implementation loads the whole document first; subclasses
+   * may override it to read only the requested field. Prefer it when this
+   * is the only field you want from the document and the field is known to
+   * have a single value (like a document id).
+   */
+  public String getDocFieldString(int doc, String field) throws IOException {
+    final Field match = document(doc).getField(field);
+    if (match == null) return null;
+    return match.stringValue();
+  }
+
+
/** Returns true if document n has been deleted */
public abstract boolean isDeleted(int n);
Index: src/java/org/apache/lucene/index/SegmentReader.java
===================================================================
--- src/java/org/apache/lucene/index/SegmentReader.java (revision 382277)
+++ src/java/org/apache/lucene/index/SegmentReader.java (working copy)
@@ -51,6 +51,7 @@
// Compound File Reader when based on a compound file segment
CompoundFileReader cfsReader = null;
+
private class Norm {
public Norm(IndexInput in, int number)
{
@@ -284,6 +285,20 @@
return fieldsReader.doc(n);
}
+  public synchronized Field getDocField(int doc, String fieldName) throws IOException {
+    // Synchronized like isDeleted: docField seeks the streams shared by fieldsReader.
+    if (isDeleted(doc))
+      throw new IllegalArgumentException("attempt to access a deleted document");
+    return fieldsReader.docField(doc, fieldName);
+  }
+
+  public synchronized String getDocFieldString(int doc, String fieldName) throws IOException {
+    // Synchronized like isDeleted: docFieldString seeks the streams shared by fieldsReader.
+    if (isDeleted(doc))
+      throw new IllegalArgumentException("attempt to access a deleted document");
+    return fieldsReader.docFieldString(doc, fieldName);
+  }
+
public synchronized boolean isDeleted(int n) {
return (deletedDocs != null && deletedDocs.get(n));
}
Index: src/java/org/apache/lucene/store/IndexInput.java
===================================================================
--- src/java/org/apache/lucene/store/IndexInput.java (revision 382277)
+++ src/java/org/apache/lucene/store/IndexInput.java (working copy)
@@ -103,18 +103,26 @@
throws IOException {
final int end = start + length;
for (int i = start; i < end; i++) {
+ buffer[i] = readChar();
+ }
+ }
+
+ /** Reads a single UTF-8 encoded character (one to three bytes).
+ * @return the next character, decoded from its UTF-8 style encoding in the stream.
+ * @see IndexOutput#writeChars(String,int,int)
+ */
+ public final char readChar() throws IOException {
byte b = readByte();
if ((b & 0x80) == 0)
- buffer[i] = (char)(b & 0x7F);
+ return (char)(b & 0x7F);
else if ((b & 0xE0) != 0xE0) {
- buffer[i] = (char)(((b & 0x1F) << 6)
+ return (char)(((b & 0x1F) << 6)
| (readByte() & 0x3F));
} else
- buffer[i] = (char)(((b & 0x0F) << 12)
+ return (char)(((b & 0x0F) << 12)
| ((readByte() & 0x3F) << 6)
| (readByte() & 0x3F));
}
- }
/** Closes the stream to futher operations. */
public abstract void close() throws IOException;