Index: src/test/org/apache/lucene/index/TestFieldsReader.java =================================================================== --- src/test/org/apache/lucene/index/TestFieldsReader.java (revision 382277) +++ src/test/org/apache/lucene/index/TestFieldsReader.java (working copy) @@ -34,9 +34,22 @@ super(s); } + public static final String FIELD_UTF1_TEXT = "field one \u4e00text"; + public static final String TEXT_FIELD_UTF1_KEY = "textField1Utf8"; + public static Field textUtfField1 = new Field(TEXT_FIELD_UTF1_KEY, FIELD_UTF1_TEXT, + Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO); + + public static final String FIELD_UTF2_TEXT = "field field field \u4e00two text"; + //Fields will be lexicographically sorted. So, the order is: field, text, two + public static final int [] FIELD_UTF2_FREQS = {3, 1, 1}; + public static final String TEXT_FIELD_UTF2_KEY = "textField2Utf8"; + public static Field textUtfField2 = new Field(TEXT_FIELD_UTF2_KEY, FIELD_UTF2_TEXT, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS); + protected void setUp() throws IOException { fieldInfos = new FieldInfos(); DocHelper.setupDoc(testDoc); + testDoc.add(textUtfField1); + testDoc.add(textUtfField2); fieldInfos.add(testDoc); DocumentWriter writer = new DocumentWriter(dir, new WhitespaceAnalyzer(), Similarity.getDefault(), 50); @@ -69,6 +82,111 @@ assertTrue(field.isStorePositionWithTermVector() == false); assertTrue(field.getOmitNorms() == true); + reader.close(); + } + + public void testUtf8() throws IOException { + assertTrue(dir != null); + assertTrue(fieldInfos != null); + FieldsReader reader = new FieldsReader(dir, "test", fieldInfos); + assertTrue(reader != null); + assertTrue(reader.size() == 1); + Document doc = reader.doc(0); + assertTrue(doc != null); + assertTrue(doc.getField("textField1") != null); + + Field field = doc.getField(TEXT_FIELD_UTF1_KEY); + assertTrue(field != null); + assertEquals(FIELD_UTF1_TEXT, field.stringValue()); + + field = 
doc.getField(TEXT_FIELD_UTF2_KEY); + assertTrue(field != null); + assertTrue(field.isTermVectorStored() == true); + + assertTrue(field.isStoreOffsetWithTermVector() == true); + assertTrue(field.isStorePositionWithTermVector() == true); + assertTrue(field.getOmitNorms() == false); + assertEquals(FIELD_UTF2_TEXT, field.stringValue()); + + reader.close(); + + } + + public void testDocField() throws IOException { + assertTrue(dir != null); + assertTrue(fieldInfos != null); + FieldsReader reader = new FieldsReader(dir, "test", fieldInfos); + assertTrue(reader != null); + assertTrue(reader.size() == 1); + + Field field = reader.docField(0,"textField1"); + assertTrue(field != null); + assertEquals(DocHelper.FIELD_1_TEXT, field.stringValue()); + + field = reader.docField(0,"textField2"); + assertTrue(field != null); + assertTrue(field.isTermVectorStored() == true); + + assertTrue(field.isStoreOffsetWithTermVector() == true); + assertTrue(field.isStorePositionWithTermVector() == true); + assertTrue(field.getOmitNorms() == false); + assertEquals(DocHelper.FIELD_2_TEXT, field.stringValue()); + + field = reader.docField(0, "textField3"); + assertTrue(field != null); + assertTrue(field.isTermVectorStored() == false); + assertTrue(field.isStoreOffsetWithTermVector() == false); + assertTrue(field.isStorePositionWithTermVector() == false); + assertTrue(field.getOmitNorms() == true); + assertEquals(DocHelper.FIELD_3_TEXT, field.stringValue()); + + assertTrue(reader.docField(0,"doesnotexist") == null); + + reader.close(); + } + public void testDocFieldUtf8() throws IOException { + assertTrue(dir != null); + assertTrue(fieldInfos != null); + FieldsReader reader = new FieldsReader(dir, "test", fieldInfos); + assertTrue(reader != null); + assertTrue(reader.size() == 1); + + // Just make sure that the addition of the Chinese character worked correctly. 
+ assertEquals(FIELD_UTF1_TEXT.length(), DocHelper.FIELD_1_TEXT.length() + 1); + assertEquals(FIELD_UTF1_TEXT.getBytes("UTF-8").length, DocHelper.FIELD_1_TEXT.getBytes("UTF-8").length + 3); + assertEquals(FIELD_UTF2_TEXT.length(), DocHelper.FIELD_2_TEXT.length() + 1); + assertEquals(FIELD_UTF2_TEXT.getBytes("UTF-8").length, DocHelper.FIELD_2_TEXT.getBytes("UTF-8").length + 3); + + Field field = reader.docField(0,TEXT_FIELD_UTF1_KEY); + assertTrue(field != null); + assertEquals(FIELD_UTF1_TEXT, field.stringValue()); + + field = reader.docField(0,TEXT_FIELD_UTF2_KEY); + assertTrue(field != null); + assertTrue(field.isTermVectorStored() == true); + + assertTrue(field.isStoreOffsetWithTermVector() == true); + assertTrue(field.isStorePositionWithTermVector() == true); + assertTrue(field.getOmitNorms() == false); + assertEquals(FIELD_UTF2_TEXT, field.stringValue()); + + reader.close(); + } + + public void testDocFieldString() throws IOException { + assertTrue(dir != null); + assertTrue(fieldInfos != null); + FieldsReader reader = new FieldsReader(dir, "test", fieldInfos); + assertTrue(reader != null); + assertTrue(reader.size() == 1); + + assertEquals(DocHelper.FIELD_1_TEXT, reader.docFieldString(0,"textField1")); + assertEquals(DocHelper.FIELD_2_TEXT, reader.docFieldString(0,"textField2")); + assertEquals(DocHelper.FIELD_3_TEXT, reader.docFieldString(0, "textField3")); + assertEquals(FIELD_UTF1_TEXT, reader.docFieldString(0,TEXT_FIELD_UTF1_KEY)); + assertEquals(FIELD_UTF2_TEXT, reader.docFieldString(0,TEXT_FIELD_UTF2_KEY)); + + assertTrue(reader.docFieldString(0,"doesnotexist") == null); reader.close(); } Index: src/java/org/apache/lucene/index/MultiReader.java =================================================================== --- src/java/org/apache/lucene/index/MultiReader.java (revision 382277) +++ src/java/org/apache/lucene/index/MultiReader.java (working copy) @@ -104,6 +104,16 @@ return subReaders[i].document(n - starts[i]); // dispatch to segment reader } + 
public Field getDocField(int n, String field) throws IOException { + int i = readerIndex(n); // find segment num + return subReaders[i].getDocField(n - starts[i], field); // dispatch to segment reader + } + + public String getDocFieldString(int n, String field) throws IOException { + int i = readerIndex(n); // find segment num + return subReaders[i].getDocFieldString(n - starts[i], field); // dispatch to segment reader + } + public boolean isDeleted(int n) { int i = readerIndex(n); // find segment num return subReaders[i].isDeleted(n - starts[i]); // dispatch to segment reader Index: src/java/org/apache/lucene/index/FieldsReader.java =================================================================== --- src/java/org/apache/lucene/index/FieldsReader.java (revision 382277) +++ src/java/org/apache/lucene/index/FieldsReader.java (working copy) @@ -57,6 +57,11 @@ return size; } + /** + * Retrieve a Document that contains all of the fields stored for the given document + * @param n the document number + * @return the Document holding the stored fields of document n + */ final Document doc(int n) throws IOException { indexStream.seek(n * 8L); long position = indexStream.readLong(); @@ -67,9 +72,22 @@ for (int i = 0; i < numFields; i++) { int fieldNumber = fieldsStream.readVInt(); FieldInfo fi = fieldInfos.fieldInfo(fieldNumber); + doc.add(getFieldFromStream(fi)); + } - byte bits = fieldsStream.readByte(); + return doc; + } + /** + * Retrieve a particular field from fieldsStream, which is currently + * looking at the bits byte of a field. When finished, the fieldsStream + * will be looking at the fieldNum of the next stored field. + * @param fi the FieldInfo for the field being examined (based on the + * previous vint in the stream) + * @return the Field. 
+ */ + private Field getFieldFromStream(FieldInfo fi) throws IOException { + byte bits = fieldsStream.readByte(); boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0; boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0; @@ -77,9 +95,9 @@ final byte[] b = new byte[fieldsStream.readVInt()]; fieldsStream.readBytes(b, 0, b.length); if (compressed) - doc.add(new Field(fi.name, uncompress(b), Field.Store.COMPRESS)); + return new Field(fi.name, uncompress(b), Field.Store.COMPRESS); else - doc.add(new Field(fi.name, b, Field.Store.YES)); + return new Field(fi.name, b, Field.Store.YES); } else { Field.Index index; @@ -123,7 +141,7 @@ index, termVector); f.setOmitNorms(fi.omitNorms); - doc.add(f); + return f; } else { Field f = new Field(fi.name, // name @@ -132,13 +150,127 @@ index, termVector); f.setOmitNorms(fi.omitNorms); - doc.add(f); + return f; } } } - return doc; + + /** + * Retrieve the first field in the given document with the + * specified field name. If there is more than one field + * in the document with that field name, only the first one + * is returned. + * @param n the document to retrieve + * @param fieldName the name of the field to retrieve + * @return the first field in the document with that name, or null + * if the document doesn't have such a field stored. 
+ * @see Document#getField(String) + * @throws IOException + */ + final Field docField(int n, String fieldName) throws IOException { + int fieldNo = fieldInfos.fieldNumber(fieldName); + if (fieldNo < 0) return null; + + // Seek to the start of all the fields + indexStream.seek(n * 8L); + long position = indexStream.readLong(); + fieldsStream.seek(position); + + int numFields = fieldsStream.readVInt(); + + for (int i = 0; i < numFields; i++) { + int fieldNumber = fieldsStream.readVInt(); + + if (fieldNumber == fieldNo) { + // This is the field we want + FieldInfo fi = fieldInfos.fieldInfo(fieldNumber); + return getFieldFromStream(fi); + } else { + // This is the field we want to skip + byte bits = fieldsStream.readByte(); // The bits + boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0; + boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0; + + int dataLength = fieldsStream.readVInt(); // Length to skip; + + if (compressed || binary) { + fieldsStream.seek(fieldsStream.getFilePointer() + dataLength); + } else { + // If not compressed or binary, we store the number of chars, not number of bytes. + for (int j = 0; j < dataLength; j++) { + fieldsStream.readChar(); + } + } + continue; + } + } + // The field wasn't defined on the document, so ignore it. + return null; + } + + /** + * Retrieve the string value of the first field in the given document with the + * specified field name. If there is more than one field + * in the document with that field name, only the first one + * is returned. + * @param n the document to retrieve + * @param fieldName the name of the field to retrieve + * @return the string value of the field in the document with that name, or null + * if the document doesn't have such a field stored, or if the value stored + * is a binary value. 
+ * @see Document#getField(String) + * @throws IOException + */ + final String docFieldString(int n, String fieldName) throws IOException { + int fieldNo = fieldInfos.fieldNumber(fieldName); + if (fieldNo < 0) return null; + + // Seek to the start of all the fields + indexStream.seek(n * 8L); + long position = indexStream.readLong(); + fieldsStream.seek(position); + + int numFields = fieldsStream.readVInt(); + + for (int i = 0; i < numFields; i++) { + int fieldNumber = fieldsStream.readVInt(); + byte bits = fieldsStream.readByte(); // The bits + boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0; + boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0; + + + if (fieldNumber == fieldNo) { + // This is the field we want to return + if (binary) return null; // Don't return binary + if (compressed) { + int dataLength = fieldsStream.readVInt(); // Length of the compressed data to read + final byte[] b = new byte[dataLength]; + fieldsStream.readBytes(b, 0, b.length); + return new String(uncompress(b), "UTF-8"); // uncompress the value and return it as a string + } else { + return fieldsStream.readString(); + } + } else { + // This is the field we want to skip + int dataLength = fieldsStream.readVInt(); // Length to skip; + if (compressed || binary) { + fieldsStream.seek(fieldsStream.getFilePointer() + dataLength); + } else { + // If not compressed or binary, we store the number of chars, not number of bytes. + for (int j = 0; j < dataLength; j++) { + fieldsStream.readChar(); + } + } + continue; + } + + } + // The field wasn't defined on the document, so ignore it. 
+ return null; + } + private final byte[] uncompress(final byte[] input) throws IOException Index: src/java/org/apache/lucene/index/IndexReader.java =================================================================== --- src/java/org/apache/lucene/index/IndexReader.java (revision 382277) +++ src/java/org/apache/lucene/index/IndexReader.java (working copy) @@ -357,6 +357,36 @@ Document in this index. 
*/ public abstract Document document(int n) throws IOException; + /** + * Return the document field for the given document. If querying for + * only one field on a document, implementations may make this more efficient + * than calling document(doc).getField(field). It will only return the + * first value of the field in the document, like {@link Document#getField(String)} + * + * Only use this function if you know there can be only one value for the field + * (like a document id), this is the only field you want, + * and you want to reduce the overhead of querying. + */ + public Field getDocField(int doc, String field) throws IOException { + return document(doc).getField(field); + } + + /** + * Return the string value of the document field for the given document. If querying for + * only one field on a document, implementations may make this more efficient + * than calling document(doc).getField(field). It will only return the + * first value of the field in the document, like {@link Document#getField(String)} + * + * Only use this function if you know there can be only one value for the field + * (like a document id), this is the only field you want, + * and you want to reduce the overhead of querying. + */ + public String getDocFieldString(int doc, String field) throws IOException { + Field result = document(doc).getField(field); + return result == null ? 
null : result.stringValue(); + } + + /** Returns true if document n has been deleted */ public abstract boolean isDeleted(int n); Index: src/java/org/apache/lucene/index/SegmentReader.java =================================================================== --- src/java/org/apache/lucene/index/SegmentReader.java (revision 382277) +++ src/java/org/apache/lucene/index/SegmentReader.java (working copy) @@ -51,6 +51,7 @@ // Compound File Reader when based on a compound file segment CompoundFileReader cfsReader = null; + private class Norm { public Norm(IndexInput in, int number) { @@ -284,6 +285,20 @@ return fieldsReader.doc(n); } + public Field getDocField(int doc, String fieldName) throws IOException { + if (isDeleted(doc)) + throw new IllegalArgumentException + ("attempt to access a deleted document"); + return fieldsReader.docField(doc, fieldName); + } + + public String getDocFieldString(int doc, String fieldName) throws IOException { + if (isDeleted(doc)) + throw new IllegalArgumentException + ("attempt to access a deleted document"); + return fieldsReader.docFieldString(doc, fieldName); +} + public synchronized boolean isDeleted(int n) { return (deletedDocs != null && deletedDocs.get(n)); } Index: src/java/org/apache/lucene/store/IndexInput.java =================================================================== --- src/java/org/apache/lucene/store/IndexInput.java (revision 382277) +++ src/java/org/apache/lucene/store/IndexInput.java (working copy) @@ -103,18 +103,26 @@ throws IOException { final int end = start + length; for (int i = start; i < end; i++) { + buffer[i] = readChar(); + } + } + + /** Reads a single UTF-8 encoded character + * @return the next character, decoded from its UTF-8 encoded form. 
+ * @see IndexOutput#writeChars(String,int,int) + */ + public final char readChar() throws IOException { byte b = readByte(); if ((b & 0x80) == 0) - buffer[i] = (char)(b & 0x7F); + return (char)(b & 0x7F); else if ((b & 0xE0) != 0xE0) { - buffer[i] = (char)(((b & 0x1F) << 6) + return (char)(((b & 0x1F) << 6) | (readByte() & 0x3F)); } else - buffer[i] = (char)(((b & 0x0F) << 12) + return (char)(((b & 0x0F) << 12) | ((readByte() & 0x3F) << 6) | (readByte() & 0x3F)); } - } /** Closes the stream to futher operations. */ public abstract void close() throws IOException;