### Eclipse Workspace Patch 1.0 #P Lucene Index: lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java =================================================================== --- lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java (revision 1134546) +++ lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java (working copy) @@ -40,6 +40,7 @@ import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field.TermVector; +import org.apache.lucene.document2.FieldType; import org.apache.lucene.index.*; import org.apache.lucene.index.codecs.Codec; import org.apache.lucene.index.codecs.CodecProvider; @@ -1072,7 +1073,24 @@ public static Field newField(String name, String value, Store store, Index index) { return newField(random, name, value, store, index); } + + public static org.apache.lucene.document2.Field newField(String name, String value, FieldType type) { + return newField(random, name, value, type); + } + public static org.apache.lucene.document2.Field newField(Random random, String name, String value, FieldType type) { + if (usually(random)) { + // most of the time, don't modify the params + return new org.apache.lucene.document2.Field(name, type, value); + } + + if (!type.stored() && random.nextBoolean()) { + type.setStored(true); // randomly store it + } + + return new org.apache.lucene.document2.Field(name, type, value); + } + /** * Returns a new Field instance. 
Use this when the test does not * care about some specific field settings (most tests) Index: lucene/src/test/org/apache/lucene/document2/TestDateTools.java =================================================================== --- lucene/src/test/org/apache/lucene/document2/TestDateTools.java (revision 0) +++ lucene/src/test/org/apache/lucene/document2/TestDateTools.java (working copy) @@ -1,4 +1,4 @@ -package org.apache.lucene.document; +package org.apache.lucene.document2; import java.text.ParseException; import java.text.SimpleDateFormat; @@ -8,6 +8,8 @@ import java.util.TimeZone; import java.util.Locale; +import org.apache.lucene.document.DateTools; +import org.apache.lucene.document.DateTools.Resolution; import org.apache.lucene.util.LuceneTestCase; /** Index: lucene/src/test/org/apache/lucene/document2/TestDocument.java =================================================================== --- lucene/src/test/org/apache/lucene/document2/TestDocument.java (revision 0) +++ lucene/src/test/org/apache/lucene/document2/TestDocument.java (working copy) @@ -1,6 +1,9 @@ -package org.apache.lucene.document; +package org.apache.lucene.document2; +import org.apache.lucene.document2.Document; +import org.apache.lucene.document2.Field; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; @@ -37,20 +40,20 @@ public void testBinaryField() throws Exception { Document doc = new Document(); - Fieldable stringFld = new Field("string", binaryVal, Field.Store.YES, - Field.Index.NO); - Fieldable binaryFld = new Field("binary", binaryVal.getBytes()); - Fieldable binaryFld2 = new Field("binary", binaryVal2.getBytes()); + FieldType stringType = new FieldType().setStored(true); + IndexableField stringFld = new Field("string", stringType, binaryVal); + IndexableField binaryFld = new BinaryField("binary", 
binaryVal.getBytes()); + IndexableField binaryFld2 = new BinaryField("binary", binaryVal2.getBytes()); doc.add(stringFld); doc.add(binaryFld); assertEquals(2, doc.fields.size()); - assertTrue(binaryFld.isBinary()); - assertTrue(binaryFld.isStored()); - assertFalse(binaryFld.isIndexed()); - assertFalse(binaryFld.isTokenized()); + assertTrue(((Field) binaryFld).isBinary()); + assertTrue(binaryFld.stored()); + assertFalse(binaryFld.indexed()); + assertFalse(binaryFld.tokenized()); String binaryTest = new String(doc.getBinaryValue("binary")); assertTrue(binaryTest.equals(binaryVal)); @@ -115,10 +118,11 @@ } public void testConstructorExceptions() { - new Field("name", "value", Field.Store.YES, Field.Index.NO); // okay - new Field("name", "value", Field.Store.NO, Field.Index.NOT_ANALYZED); // okay + new Field("name", new FieldType().setStored(true), "value"); // okay + new Field("name", new FieldType().setIndexed(true), "value"); // okay + /* try { - new Field("name", "value", Field.Store.NO, Field.Index.NO); + new Field("name", new FieldType(), "value"); fail(); } catch (IllegalArgumentException e) { // expected exception @@ -132,6 +136,7 @@ } catch (IllegalArgumentException e) { // expected exception } + */ } /** @@ -174,20 +179,16 @@ private Document makeDocumentWithFields() { Document doc = new Document(); - doc.add(new Field("keyword", "test1", Field.Store.YES, - Field.Index.NOT_ANALYZED)); - doc.add(new Field("keyword", "test2", Field.Store.YES, - Field.Index.NOT_ANALYZED)); - doc.add(new Field("text", "test1", Field.Store.YES, Field.Index.ANALYZED)); - doc.add(new Field("text", "test2", Field.Store.YES, Field.Index.ANALYZED)); - doc.add(new Field("unindexed", "test1", Field.Store.YES, Field.Index.NO)); - doc.add(new Field("unindexed", "test2", Field.Store.YES, Field.Index.NO)); + doc.add(new StringField("keyword", "test1")); + doc.add(new StringField("keyword", "test2")); + doc.add(new Field("text", TextField.DEFAULT_TYPE.setStored(true), "test1")); + doc.add(new 
Field("text", TextField.DEFAULT_TYPE.setStored(true), "test2")); + doc.add(new Field("unindexed", new FieldType().setStored(true), "test1")); + doc.add(new Field("unindexed", new FieldType().setStored(true), "test2")); doc - .add(new Field("unstored", "test1", Field.Store.NO, - Field.Index.ANALYZED)); + .add(new TextField("unstored", "test1")); doc - .add(new Field("unstored", "test2", Field.Store.NO, - Field.Index.ANALYZED)); + .add(new TextField("unstored", "test2")); return doc; } @@ -222,12 +223,10 @@ public void testFieldSetValue() throws Exception { - Field field = new Field("id", "id1", Field.Store.YES, - Field.Index.NOT_ANALYZED); + Field field = new Field("id", StringField.DEFAULT_TYPE.setStored(true), "id1"); Document doc = new Document(); doc.add(field); - doc.add(new Field("keyword", "test", Field.Store.YES, - Field.Index.NOT_ANALYZED)); + doc.add(new Field("keyword", StringField.DEFAULT_TYPE.setStored(true), "test")); Directory dir = newDirectory(); RandomIndexWriter writer = new RandomIndexWriter(random, dir); @@ -262,9 +261,8 @@ } public void testFieldSetValueChangeBinary() { - Field field1 = new Field("field1", new byte[0]); - Field field2 = new Field("field2", "", Field.Store.YES, - Field.Index.ANALYZED); + Field field1 = new BinaryField("field1", new byte[0]); + Field field2 = new Field("field2", TextField.DEFAULT_TYPE.setStored(true), ""); try { field1.setValue("abc"); fail("did not hit expected exception"); Index: lucene/src/java/org/apache/lucene/index/IndexReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/IndexReader.java (revision 1134546) +++ lucene/src/java/org/apache/lucene/index/IndexReader.java (working copy) @@ -17,7 +17,7 @@ * limitations under the License. 
*/ -import org.apache.lucene.document.Document; +import org.apache.lucene.document2.Document; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.search.FieldCache; // javadocs import org.apache.lucene.search.Similarity; Index: lucene/src/java/org/apache/lucene/document2/NumericField.java =================================================================== --- lucene/src/java/org/apache/lucene/document2/NumericField.java (revision 0) +++ lucene/src/java/org/apache/lucene/document2/NumericField.java (revision 0) @@ -0,0 +1,369 @@ +package org.apache.lucene.document2; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; + +import org.apache.lucene.document.NumericField.DataType; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.NumericTokenStream; +import org.apache.lucene.util.NumericUtils; +import org.apache.lucene.search.NumericRangeQuery; // javadocs +import org.apache.lucene.search.NumericRangeFilter; // javadocs +import org.apache.lucene.search.SortField; // javadocs +import org.apache.lucene.search.FieldCache; // javadocs + +/** + *

+ * This class provides a {@link Field} that enables indexing of numeric values + * for efficient range filtering and sorting. Here's an example usage, adding an + * int value: + * + *

+ * document.add(new NumericField(name).setIntValue(value));
+ * 
+ * + * For optimal performance, re-use the NumericField and + * {@link Document} instance for more than one document: + * + *
+ *  NumericField field = new NumericField(name);
+ *  Document document = new Document();
+ *  document.add(field);
+ * 
+ *  for(all documents) {
+ *    ...
+ *    field.setIntValue(value)
+ *    writer.addDocument(document);
+ *    ...
+ *  }
+ * 
+ * + *

+ * The java native types int, long, float + * and double are directly supported. However, any value that can + * be converted into these native types can also be indexed. For example, + * date/time values represented by a {@link java.util.Date} can be translated + * into a long value using the {@link java.util.Date#getTime} method. If you + * don't need millisecond precision, you can quantize the value, either by + * dividing the result of {@link java.util.Date#getTime} or using the separate + * getters (for year, month, etc.) to construct an int or + * long value. + *

+ * + *

+ * To perform range querying or filtering against a NumericField, + * use {@link NumericRangeQuery} or {@link NumericRangeFilter}. To sort + * according to a NumericField, use the normal numeric sort types, + * eg {@link SortField#INT}. NumericField values can also be loaded + * directly from {@link FieldCache}. + *

+ * + *

+ * By default, a NumericField's value is not stored but is indexed + * for range filtering and sorting. You can use the + * {@link #NumericField(String,FieldType)} constructor if you need to + * change these defaults. + *

+ * + *

+ * You may add the same field name as a NumericField to the same + * document more than once. Range querying and filtering will be the logical OR + * of all values; so a range query will hit all documents that have at least one + * value in the range. However sort behavior is not defined. If you need to + * sort, you should separately index a single-valued NumericField. + *

+ * + *

+ * A NumericField will consume somewhat more disk space in the + * index than an ordinary single-valued field. However, for a typical index that + * includes substantial textual content per document, this increase will likely + * be in the noise. + *

+ * + *

+ * Within Lucene, each numeric value is indexed as a trie structure, + * where each term is logically assigned to larger and larger pre-defined + * brackets (which are simply lower-precision representations of the value). The + * step size between each successive bracket is called the + * precisionStep, measured in bits. Smaller + * precisionStep values result in a larger number of brackets, which + * consumes more disk space in the index but may result in faster range search + * performance. The default value, 4, was selected for a reasonable tradeoff of + * disk space consumption versus performance. You can use the expert constructor + * {@link #NumericField(String,int,FieldType)} if you'd like to change + * the value. Note that you must also specify a congruent value when creating + * {@link NumericRangeQuery} or {@link NumericRangeFilter}. For low cardinality + * fields larger precision steps are good. If the cardinality is < 100, it is + * fair to use {@link Integer#MAX_VALUE}, which produces one term per value. + * + *

+ * For more information on the internals of numeric trie indexing, including the + * + * precisionStep configuration, see {@link NumericRangeQuery}. + * The format of indexed values is described in {@link NumericUtils}. + * + *

+ * If you only need to sort by numeric value, and never run range + * querying/filtering, you can index using a precisionStep of + * {@link Integer#MAX_VALUE}. This will minimize disk space consumed. + *

+ * + *

+ * More advanced users can instead use {@link NumericTokenStream} directly, when + * indexing numbers. This class is a wrapper around this token stream type for + * easier, more intuitive usage. + *

+ * + * @since 2.9 + */ +public final class NumericField extends Field { + + /** + * Data type of the value in {@link NumericField}. + * + * @since 3.2 + */ + + /* + public static enum DataType { + INT, LONG, FLOAT, DOUBLE + } + */ + + public static final FieldType DEFAULT_TYPE = new FieldType() + .setIndexed(true) + .setOmitNorms(true) + .setOmitTermFreqAndPositions(true); + + private org.apache.lucene.document.NumericField.DataType dataType; + private transient NumericTokenStream numericTS; + private final int precisionStep; + + /** + * Creates a field for numeric values using the default + * precisionStep {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). + * The instance is not yet initialized with a numeric value, before indexing a + * document containing this field, set a value using the various set + * ???Value() methods. This constructor creates an indexed, but not + * stored field. + * + * @param name + * the field name + */ + public NumericField(String name) { + this(name, NumericUtils.PRECISION_STEP_DEFAULT, NumericField.DEFAULT_TYPE); + } + + /** + * Creates a field for numeric values using the default + * precisionStep {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). + * The instance is not yet initialized with a numeric value, before indexing a + * document containing this field, set a value using the various set + * ???Value() methods. + * + * @param name + * the field name + * @param store + * if the field should be stored, {@link Document#getFieldable} then + * returns {@code NumericField} instances on search results. + * @param index + * if the field should be indexed using {@link NumericTokenStream} + */ + public NumericField(String name, FieldType type) { + this(name, NumericUtils.PRECISION_STEP_DEFAULT, type); + } + + /** + * Creates a field for numeric values with the specified + * precisionStep. 
The instance is not yet initialized with a + * numeric value, before indexing a document containing this field, set a + * value using the various set???Value() methods. This constructor + * creates an indexed, but not stored field. + * + * @param name + * the field name + * @param precisionStep + * the used precision step + */ + public NumericField(String name, int precisionStep) { + this(name, precisionStep, NumericField.DEFAULT_TYPE); + } + + /** + * Creates a field for numeric values with the specified + * precisionStep. The instance is not yet initialized with a + * numeric value, before indexing a document containing this field, set a + * value using the various set???Value() methods. + * + * @param name + * the field name + * @param precisionStep + * the used precision step + * @param store + * if the field should be stored, {@link Document#getFieldable} then + * returns {@code NumericField} instances on search results. + * @param index + * if the field should be indexed using {@link NumericTokenStream} + */ + public NumericField(String name, int precisionStep, FieldType type) { + super(name, type); + this.precisionStep = precisionStep; + } + + /** Returns a {@link NumericTokenStream} for indexing the numeric value. 
*/ + public TokenStream tokenStreamValue() { + if (!indexed()) return null; + if (numericTS == null) { + // lazy init the TokenStream as it is heavy to instantiate + // (attributes,...), + // if not needed (stored field loading) + numericTS = new NumericTokenStream(precisionStep); + // initialize value in TokenStream + if (fieldsData != null) { + assert dataType != null; + final Number val = (Number) fieldsData; + switch (dataType) { + case INT: + numericTS.setIntValue(val.intValue()); + break; + case LONG: + numericTS.setLongValue(val.longValue()); + break; + case FLOAT: + numericTS.setFloatValue(val.floatValue()); + break; + case DOUBLE: + numericTS.setDoubleValue(val.doubleValue()); + break; + default: + assert false : "Should never get here"; + } + } + } + return numericTS; + } + + /** Returns always null for numeric fields */ + public Reader readerValue() { + return null; + } + + /** + * Returns the numeric value as a string. This format is also returned if you + * call {@link Document#get(String)} on search results. It is recommended to + * use {@link Document#getFieldable} instead that returns {@code NumericField} + * instances. You can then use {@link #getNumericValue} to return the stored + * value. + */ + public String stringValue() { + return (fieldsData == null) ? null : fieldsData.toString(); + } + + /** + * Returns the current numeric value as a subclass of {@link Number}, + * null if not yet initialized. + */ + public Number getNumericValue() { + return (Number) fieldsData; + } + + /** Returns the precision step. */ + public int getPrecisionStep() { + return precisionStep; + } + + /** + * Returns the data type of the current value, {@code null} if not yet set. + * + * @since 3.2 + */ + public DataType getNumericDataType() { + return dataType; + } + + public boolean isNumeric() { + return true; + } + + /** + * Initializes the field with the supplied long value. 
+ * + * @param value + * the numeric value + * @return this instance, because of this you can use it the following way: + * document.add(new NumericField(name, precisionStep).setLongValue(value)) + */ + public NumericField setLongValue(final long value) { + if (numericTS != null) numericTS.setLongValue(value); + fieldsData = Long.valueOf(value); + dataType = DataType.LONG; + return this; + } + + /** + * Initializes the field with the supplied int value. + * + * @param value + * the numeric value + * @return this instance, because of this you can use it the following way: + * document.add(new NumericField(name, precisionStep).setIntValue(value)) + */ + public NumericField setIntValue(final int value) { + if (numericTS != null) numericTS.setIntValue(value); + fieldsData = Integer.valueOf(value); + dataType = DataType.INT; + return this; + } + + /** + * Initializes the field with the supplied double value. + * + * @param value + * the numeric value + * @return this instance, because of this you can use it the following way: + * document.add(new NumericField(name, precisionStep).setDoubleValue(value)) + */ + public NumericField setDoubleValue(final double value) { + if (numericTS != null) numericTS.setDoubleValue(value); + fieldsData = Double.valueOf(value); + dataType = DataType.DOUBLE; + return this; + } + + /** + * Initializes the field with the supplied float value. 
+ * + * @param value + * the numeric value + * @return this instance, because of this you can use it the following way: + * document.add(new NumericField(name, precisionStep).setFloatValue(value)) + */ + public NumericField setFloatValue(final float value) { + if (numericTS != null) numericTS.setFloatValue(value); + fieldsData = Float.valueOf(value); + dataType = DataType.FLOAT; + return this; + } + +} Index: lucene/src/java/org/apache/lucene/document2/package.html =================================================================== --- lucene/src/java/org/apache/lucene/document2/package.html (revision 0) +++ lucene/src/java/org/apache/lucene/document2/package.html (revision 0) @@ -0,0 +1,56 @@ + + + + + + + +

The logical representation of a {@link org.apache.lucene.document.Document} for indexing and searching.

+

The document package provides the user level logical representation of content to be indexed and searched. The +package also provides utilities for working with {@link org.apache.lucene.document.Document}s and {@link org.apache.lucene.document.Fieldable}s.

+

Document and Fieldable

+

A {@link org.apache.lucene.document.Document} is a collection of {@link org.apache.lucene.document.Fieldable}s. A + {@link org.apache.lucene.document.Fieldable} is a logical representation of a user's content that needs to be indexed or stored. + {@link org.apache.lucene.document.Fieldable}s have a number of properties that tell Lucene how to treat the content (like indexed, tokenized, + stored, etc.) See the {@link org.apache.lucene.document.Field} implementation of {@link org.apache.lucene.document.Fieldable} + for specifics on these properties. +

+

Note: it is common to refer to {@link org.apache.lucene.document.Document}s having {@link org.apache.lucene.document.Field}s, even though technically they have +{@link org.apache.lucene.document.Fieldable}s.

+

Working with Documents

+

First and foremost, a {@link org.apache.lucene.document.Document} is something created by the user application. It is your job + to create Documents based on the content of the files you are working with in your application (Word, txt, PDF, Excel or any other format.) + How this is done is completely up to you. That being said, there are many tools available in other projects that can simplify + the process of taking a file and converting it into a Lucene {@link org.apache.lucene.document.Document}. To see an example of this, + take a look at the Lucene demo and the associated source code + for extracting content from HTML. +

+

The {@link org.apache.lucene.document.DateTools} is a utility class to make dates and times searchable +(remember, Lucene only searches text). {@link org.apache.lucene.document.NumericField} is a special helper class +to simplify indexing of numeric values (and also dates) for fast range queries with {@link org.apache.lucene.search.NumericRangeQuery} +(using a special sortable string representation of numeric values).

+

The {@link org.apache.lucene.document.FieldSelector} class provides a mechanism to tell Lucene how to load Documents from +storage. If no FieldSelector is used, all Fieldables on a Document will be loaded. As an example of the FieldSelector usage, consider + the common use case of +displaying search results on a web page and then having users click through to see the full document. In this scenario, it is often + the case that there are many small fields and one or two large fields (containing the contents of the original file). Before the FieldSelector, +the full Document had to be loaded, including the large fields, in order to display the results. Now, using the FieldSelector, one +can {@link org.apache.lucene.document.FieldSelectorResult#LAZY_LOAD} the large fields, thus only loading the large fields +when a user clicks on the actual link to view the original content.

+ + Index: lucene/src/test/org/apache/lucene/document2/TestBinaryDocument.java =================================================================== --- lucene/src/test/org/apache/lucene/document2/TestBinaryDocument.java (revision 0) +++ lucene/src/test/org/apache/lucene/document2/TestBinaryDocument.java (working copy) @@ -1,7 +1,14 @@ -package org.apache.lucene.document; +package org.apache.lucene.document2; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.document.CompressionTools; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Fieldable; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.Field.TermVector; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.store.Directory; Index: lucene/src/java/org/apache/lucene/document2/TextField.java =================================================================== --- lucene/src/java/org/apache/lucene/document2/TextField.java (revision 0) +++ lucene/src/java/org/apache/lucene/document2/TextField.java (revision 0) @@ -0,0 +1,39 @@ +package org.apache.lucene.document2; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; + +public final class TextField extends Field { + + public static final FieldType DEFAULT_TYPE = new FieldType() + .setIndexed(true) + .setTokenized(true); + + public TextField(String name, Reader reader) { + super(name, TextField.DEFAULT_TYPE, reader); + } + + public TextField(String name, String value) { + super(name, TextField.DEFAULT_TYPE, value); + } + + public boolean isNumeric() { + return false; + } +} Index: lucene/src/java/org/apache/lucene/document2/MapFieldSelector.java =================================================================== --- lucene/src/java/org/apache/lucene/document2/MapFieldSelector.java (revision 0) +++ lucene/src/java/org/apache/lucene/document2/MapFieldSelector.java (revision 0) @@ -0,0 +1,67 @@ +package org.apache.lucene.document2; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * A {@link FieldSelector} based on a Map of field names to {@link FieldSelectorResult}s + * + */ +public class MapFieldSelector implements FieldSelector { + + Map fieldSelections; + + /** Create a a MapFieldSelector + * @param fieldSelections maps from field names (String) to {@link FieldSelectorResult}s + */ + public MapFieldSelector(Map fieldSelections) { + this.fieldSelections = fieldSelections; + } + + /** Create a a MapFieldSelector + * @param fields fields to LOAD. List of Strings. All other fields are NO_LOAD. + */ + public MapFieldSelector(List fields) { + fieldSelections = new HashMap(fields.size()*5/3); + for (final String field : fields) + fieldSelections.put(field, FieldSelectorResult.LOAD); + } + + /** Create a a MapFieldSelector + * @param fields fields to LOAD. All other fields are NO_LOAD. + */ + public MapFieldSelector(String... fields) { + this(Arrays.asList(fields)); + } + + + + /** Load field according to its associated value in fieldSelections + * @param field a field name + * @return the fieldSelections value that field maps to or NO_LOAD if none. + */ + public FieldSelectorResult accept(String field) { + FieldSelectorResult selection = fieldSelections.get(field); + return selection!=null ? selection : FieldSelectorResult.NO_LOAD; + } + +} Index: lucene/src/java/org/apache/lucene/document2/FieldType.java =================================================================== --- lucene/src/java/org/apache/lucene/document2/FieldType.java (revision 0) +++ lucene/src/java/org/apache/lucene/document2/FieldType.java (revision 0) @@ -0,0 +1,157 @@ +package org.apache.lucene.document2; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class FieldType { + + private boolean indexed; + private boolean stored; + private boolean tokenized; + private boolean storeTermVectors; + private boolean storeTermVectorsOffsets; + private boolean storeTermVectorsPositions; + private boolean omitNorms; + private boolean omitTermFreqsAndPositions; + private boolean lazy; + + public boolean indexed() { + return this.indexed; + } + + public FieldType setIndexed(boolean value) { + this.indexed = value; + return this; + } + + public boolean stored() { + return this.stored; + } + + public FieldType setStored(boolean value) { + this.stored = value; + return this; + } + + public boolean tokenized() { + return this.tokenized; + } + + public FieldType setTokenized(boolean value) { + this.tokenized = value; + return this; + } + + public boolean storeTermVectors() { + return this.storeTermVectors; + } + + public FieldType setStoreTermVectors(boolean value) { + this.storeTermVectors = value; + return this; + } + + public boolean storeTermVectorOffsets() { + return this.storeTermVectorsOffsets; + } + + public FieldType setStoreTermVectorOffsets(boolean value) { + this.storeTermVectorsOffsets = value; + return this; + } + + public boolean storeTermVectorPositions() { + return this.storeTermVectorsPositions; + } + + public FieldType setStoreTermVectorPositions(boolean value) { + this.storeTermVectorsPositions = value; + return this; + } + 
+ public boolean omitNorms() { + return this.omitNorms; + } + + public FieldType setOmitNorms(boolean value) { + this.omitNorms = value; + return this; + } + + public boolean omitTermFreqAndPositions() { + return this.omitTermFreqsAndPositions; + } + + public FieldType setOmitTermFreqAndPositions(boolean value) { + this.omitTermFreqsAndPositions = value; + return this; + } + + public boolean lazy() { + return this.lazy; + } + + public FieldType setLazy(boolean value) { + this.lazy = value; + return this; + } + + /** Prints a Field for human consumption. */ + @Override + public final String toString() { + StringBuilder result = new StringBuilder(); + if (stored()) { + result.append("stored"); + } + if (indexed()) { + if (result.length() > 0) + result.append(","); + result.append("indexed"); + } + if (tokenized()) { + if (result.length() > 0) + result.append(","); + result.append("tokenized"); + } + if (storeTermVectors()) { + if (result.length() > 0) + result.append(","); + result.append("termVector"); + } + if (storeTermVectorOffsets()) { + if (result.length() > 0) + result.append(","); + result.append("termVectorOffsets"); + } + if (storeTermVectorPositions()) { + if (result.length() > 0) + result.append(","); + result.append("termVectorPosition"); + } + if (omitNorms()) { + result.append(",omitNorms"); + } + if (omitTermFreqAndPositions()) { + result.append(",omitTermFreqAndPositions"); + } + if (lazy()){ + result.append(",lazy"); + } + + return result.toString(); + } +} Index: lucene/src/java/org/apache/lucene/document2/SetBasedFieldSelector.java =================================================================== --- lucene/src/java/org/apache/lucene/document2/SetBasedFieldSelector.java (revision 0) +++ lucene/src/java/org/apache/lucene/document2/SetBasedFieldSelector.java (revision 0) @@ -0,0 +1,58 @@ +package org.apache.lucene.document2; + +import java.util.Set; +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, 
Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Declare what fields to load normally and what fields to load lazily
+ *
+ **/
+public class SetBasedFieldSelector implements FieldSelector {
+
+  private final Set<String> fieldsToLoad;
+  private final Set<String> lazyFieldsToLoad;
+
+  /**
+   * Pass in the Set of {@link Field} names to load and the Set of {@link Field} names to load lazily.  If both are empty, the
+   * Document will not have any {@link Field} on it.
+   * @param fieldsToLoad A Set of {@link String} field names to load.  May be empty, but not null
+   * @param lazyFieldsToLoad A Set of {@link String} field names to load lazily.  May be empty, but not null
+   */
+  public SetBasedFieldSelector(Set<String> fieldsToLoad, Set<String> lazyFieldsToLoad) {
+    this.fieldsToLoad = fieldsToLoad;
+    this.lazyFieldsToLoad = lazyFieldsToLoad;
+  }
+
+  /**
+   * Indicate whether to load the field with the given name or not. If the {@link Field#name()} is not in either of the
+   * initializing Sets, then {@link FieldSelectorResult#NO_LOAD} is returned.  If a Field name
+   * is in both <code>fieldsToLoad</code> and <code>lazyFieldsToLoad</code>, lazy has precedence.
+   *
+   * @param fieldName The {@link Field} name to check
+   * @return The {@link FieldSelectorResult}
+   */
+  public FieldSelectorResult accept(String fieldName) {
+    FieldSelectorResult result = FieldSelectorResult.NO_LOAD;
+    if (fieldsToLoad.contains(fieldName)) {
+      result = FieldSelectorResult.LOAD;
+    }
+    if (lazyFieldsToLoad.contains(fieldName)) { // checked last so that lazy wins
+      result = FieldSelectorResult.LAZY_LOAD;
+    }
+    return result;
+  }
+}
\ No newline at end of file
Index: lucene/src/java/org/apache/lucene/index/IndexableField.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/IndexableField.java (revision 1134546)
+++ lucene/src/java/org/apache/lucene/index/IndexableField.java (working copy)
@@ -55,6 +55,8 @@
   public float boost();
 
   public boolean stored();
+
+  public boolean lazy();
 
   // nocommit -- isBinary?
   public BytesRef binaryValue(BytesRef reuse);
Index: lucene/src/java/org/apache/lucene/document2/Field.java
===================================================================
--- lucene/src/java/org/apache/lucene/document2/Field.java (revision 0)
+++ lucene/src/java/org/apache/lucene/document2/Field.java (revision 0)
@@ -0,0 +1,353 @@
+package org.apache.lucene.document2;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.document.NumericField;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.StringHelper;
+
+/**
+ * A field is a section of a Document. Each field has two parts, a name and a
+ * value. Values may be free text, provided as a String or as a Reader, or they
+ * may be atomic keywords, which are not further processed. Such keywords may be
+ * used to represent dates, urls, etc. Fields are optionally stored in the
+ * index, so that they may be returned with hits on the document.
+ */
+
+public class Field implements IndexableField {
+
+  // flags describing how this field is handled; every flag accessor below
+  // delegates to it, so it must be assigned by every constructor
+  protected FieldType type;
+  protected String name = "body";
+  // the data object for all different kind of field values
+  protected Object fieldsData = null;
+  // pre-analyzed tokenStream for indexed fields
+  protected TokenStream tokenStream;
+  protected boolean isBinary = false;
+  // length/offset for all primitive types
+  protected int binaryLength;
+  protected int binaryOffset;
+
+  protected float boost = 1.0f;
+
+  /**
+   * Creates a field that has a name and a type but no value yet.
+   *
+   * @throws NullPointerException if name is null
+   */
+  public Field(String name, FieldType type) {
+    if (name == null)
+      throw new NullPointerException("name cannot be null");
+
+    this.name = StringHelper.intern(name); // field names are interned
+    this.type = type;
+  }
+
+  /**
+   * Creates a field whose value is read from the given Reader.
+   *
+   * @throws NullPointerException if name or reader is null
+   */
+  public Field(String name, FieldType type, Reader reader) {
+    if (name == null)
+      throw new NullPointerException("name cannot be null");
+    if (reader == null)
+      throw new NullPointerException("reader cannot be null");
+
+    this.name = StringHelper.intern(name); // field names are interned
+    this.type = type;
+    this.fieldsData = reader;
+  }
+
+  /**
+   * Creates a field whose indexed tokens come from the given TokenStream.
+   *
+   * @throws NullPointerException if name or tokenStream is null
+   */
+  public Field(String name, FieldType type, TokenStream tokenStream) {
+    if (name == null)
+      throw new NullPointerException("name cannot be null");
+    if (tokenStream == null)
+      throw new NullPointerException("tokenStream cannot be null");
+
+    this.name = StringHelper.intern(name); // field names are interned
+    this.type = type;
+    this.fieldsData = null;
+    this.tokenStream = tokenStream;
+  }
+
+  /** Creates a binary field that uses the whole of the given array as its value. */
+  public Field(String name, FieldType type, byte[] value) {
+    this(name, type, value, 0, value.length);
+  }
+
+  /**
+   * Creates a binary field that uses <code>length</code> bytes of
+   * <code>value</code>, starting at <code>offset</code>, as its value.
+   */
+  public Field(String name, FieldType type, byte[] value, int offset, int length) {
+    this.isBinary = true;
+    this.fieldsData = value;
+    this.type = type;
+    this.binaryOffset = offset;
+    this.binaryLength = length;
+    this.name = StringHelper.intern(name);
+  }
+
+  /** Creates a field with a String value; the name is interned. */
+  public Field(String name, FieldType type, String value) {
+    this(name, true, type, value);
+  }
+
+  /**
+   * Creates a field with a String value.
+   *
+   * @param internName whether the field name should be interned
+   * @throws IllegalArgumentException if name or value is null
+   */
+  public Field(String name, boolean internName, FieldType type, String value) {
+    if (name == null)
+      throw new IllegalArgumentException("name cannot be null");
+    if (value == null)
+      throw new IllegalArgumentException("value cannot be null");
+
+    this.type = type;
+    // field names are optionally interned; the result must land in the field,
+    // not in the constructor parameter
+    this.name = internName ? StringHelper.intern(name) : name;
+    this.fieldsData = value;
+  }
+
+  /**
+   * The value of the field as a String, or null. If null, the Reader value or
+   * binary value is used. Exactly one of stringValue(), readerValue(), and
+   * binaryValue(BytesRef) must be set.
+   */
+  public String stringValue() {
+    return fieldsData instanceof String ? (String) fieldsData : null;
+  }
+
+  /**
+   * The value of the field as a Reader, or null. If null, the String value or
+   * binary value is used. Exactly one of stringValue(), readerValue(), and
+   * binaryValue(BytesRef) must be set.
+   */
+  public Reader readerValue() {
+    return fieldsData instanceof Reader ? (Reader) fieldsData : null;
+  }
+
+  /**
+   * The TokenStream for this field to be used when indexing, or null. If null,
+   * the Reader value or String value is analyzed to produce the indexed tokens.
+   */
+  public TokenStream tokenStreamValue() {
+    return tokenStream;
+  }
+
+  /** Always returns null in this class. */
+  public Number getNumericValue() {
+    return null;
+  }
+
+  /**
+   * <p>
+   * Expert: change the value of this field. This can be used during indexing to
+   * re-use a single Field instance to improve indexing speed by avoiding GC
+   * cost of new'ing and reclaiming Field instances. Typically a single
+   * {@link Document} instance is re-used as well. This helps most on small
+   * documents.
+   * </p>
+   *
+   * <p>
+   * Each Field instance should only be used once within a single
+   * {@link Document} instance. See ImproveIndexingSpeed for details.
+   * </p>
+   *
+   * @throws IllegalArgumentException if this is a binary field
+   */
+  public void setValue(String value) {
+    if (isBinary) {
+      throw new IllegalArgumentException(
+          "cannot set a String value on a binary field");
+    }
+    fieldsData = value;
+  }
+
+  /**
+   * Expert: change the value of this field. See <a
+   * href="#setValue(java.lang.String)">setValue(String)</a>.
+   *
+   * @throws IllegalArgumentException if this is a binary or a stored field
+   */
+  public void setValue(Reader value) {
+    if (isBinary) {
+      throw new IllegalArgumentException(
+          "cannot set a Reader value on a binary field");
+    }
+    if (stored()) {
+      throw new IllegalArgumentException(
+          "cannot set a Reader value on a stored field");
+    }
+    fieldsData = value;
+  }
+
+  /**
+   * Expert: change the value of this field. See <a
+   * href="#setValue(java.lang.String)">setValue(String)</a>.
+   *
+   * @throws IllegalArgumentException if this is not a binary field
+   */
+  public void setValue(byte[] value) {
+    if (!isBinary) {
+      throw new IllegalArgumentException(
+          "cannot set a byte[] value on a non-binary field");
+    }
+    fieldsData = value;
+    binaryLength = value.length;
+    binaryOffset = 0;
+  }
+
+  /**
+   * Expert: change the value of this field. See <a
+   * href="#setValue(java.lang.String)">setValue(String)</a>.
+   *
+   * @throws IllegalArgumentException if this is not a binary field
+   */
+  public void setValue(byte[] value, int offset, int length) {
+    if (!isBinary) {
+      throw new IllegalArgumentException(
+          "cannot set a byte[] value on a non-binary field");
+    }
+    fieldsData = value;
+    binaryLength = length;
+    binaryOffset = offset;
+  }
+
+  /**
+   * Expert: sets the token stream to be used for indexing. The field type must
+   * already be both indexed and tokenized. May be combined with stored
+   * values from stringValue() or binaryValue(BytesRef).
+   *
+   * @throws IllegalArgumentException if the field is not indexed or not
+   *         tokenized
+   */
+  public void setTokenStream(TokenStream tokenStream) {
+    if (!indexed() || !tokenized()) {
+      throw new IllegalArgumentException(
+          "cannot set token stream on non indexed and tokenized field");
+    }
+    this.tokenStream = tokenStream;
+  }
+
+  public String name() {
+    return name;
+  }
+
+  public float boost() {
+    return boost;
+  }
+
+  /**
+   * Sets the boost factor hits on this field. This value will be multiplied
+   * into the score of all hits on this field of this document.
+   *
+   * <p>
+   * The boost is multiplied by
+   * {@link Document#getBoost()} of the document
+   * containing this field. If a document has multiple fields with the same
+   * name, all such values are multiplied together. This product is then used to
+   * compute the norm factor for the field. By default, in the
+   * {@link org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)}
+   * method, the boost value is multiplied by the length normalization factor
+   * and then rounded by
+   * {@link org.apache.lucene.search.Similarity#encodeNormValue(float)} before
+   * it is stored in the index. One should attempt to ensure that this product
+   * does not overflow the range of that encoding.
+   *
+   * @see Document#setBoost(float)
+   * @see org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)
+   * @see org.apache.lucene.search.Similarity#encodeNormValue(float)
+   */
+  public void setBoost(float boost) {
+    this.boost = boost;
+  }
+
+  private byte[] getBinaryValue(byte[] result /* unused */) {
+    if (isBinary || fieldsData instanceof byte[]) return (byte[]) fieldsData;
+    else return null;
+  }
+
+  public boolean numeric() {
+    return false;
+  }
+
+  public Number numericValue() {
+    return null;
+  }
+
+  public NumericField.DataType numericDataType() {
+    return null;
+  }
+
+  private byte[] getBinaryValue() {
+    return getBinaryValue(null);
+  }
+
+  /**
+   * Returns the binary value as a BytesRef over the backing array, or null if
+   * this field has no binary value. When <code>reuse</code> is non-null it is
+   * filled in and returned instead of allocating a new BytesRef.
+   */
+  public BytesRef binaryValue(BytesRef reuse) {
+    final byte[] bytes = getBinaryValue();
+    if (bytes != null) {
+      if (reuse == null) {
+        return new BytesRef(bytes, getBinaryOffset(), getBinaryLength());
+      } else {
+        reuse.bytes = bytes;
+        reuse.offset = getBinaryOffset();
+        reuse.length = getBinaryLength();
+        return reuse;
+      }
+    } else {
+      return null;
+    }
+  }
+
+  /**
+   * Returns length of byte[] segment that is used as value, if Field is not
+   * binary returned value is undefined
+   *
+   * @return length of byte[] segment that represents this Field value
+   */
+  private int getBinaryLength() {
+    if (isBinary) {
+      return binaryLength;
+    } else if (fieldsData instanceof byte[]) return ((byte[]) fieldsData).length;
+    else return 0;
+  }
+
+  /**
+   * Returns offset into byte[] segment that is used as value, if Field is not
+   * binary returned value is undefined
+   *
+   * @return index of the first character in byte[] segment that represents this
+   *         Field value
+   */
+  public int getBinaryOffset() {
+    return binaryOffset;
+  }
+
+  public boolean isBinary() {
+    return isBinary;
+  }
+
+  /** methods from inner FieldType */
+
+  public boolean stored() {
+    return type.stored();
+  }
+
+  public boolean indexed() {
+    return type.indexed();
+  }
+
+  public boolean tokenized() {
+    return type.tokenized();
+  }
+
+  public boolean omitNorms() {
+    return type.omitNorms();
+  }
+
+  public boolean omitTermFreqAndPositions() {
+    return type.omitTermFreqAndPositions();
+  }
+
+  public boolean storeTermVectors() {
+    return type.storeTermVectors();
+  }
+
+  public boolean storeTermVectorOffsets() {
+    return type.storeTermVectorOffsets();
+  }
+
+  public boolean storeTermVectorPositions() {
+    return type.storeTermVectorPositions();
+  }
+
+  public boolean lazy() {
+    return type.lazy();
+  }
+}
Index: lucene/src/java/org/apache/lucene/document2/FieldSelectorResult.java
===================================================================
--- lucene/src/java/org/apache/lucene/document2/FieldSelectorResult.java (revision 0)
+++ lucene/src/java/org/apache/lucene/document2/FieldSelectorResult.java (revision 0)
@@ -0,0 +1,76 @@
+package org.apache.lucene.document2;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ *  Provides information about what should be done with this Field
+ *
+ *  NOTE(review): the link targets used below (Document#getField,
+ *  Document#getFieldable, Document#add(Fieldable), Fieldable) follow the
+ *  org.apache.lucene.document API -- confirm that they resolve against the
+ *  document2 classes before committing.
+ *
+ **/
+public enum FieldSelectorResult {
+
+  /**
+   * Load this {@link Field} every time the {@link Document} is loaded, reading in the data as it is encountered.
+   *  {@link Document#getField(String)} and {@link Document#getFieldable(String)} should not return null.
+   *<p/>
+   * {@link Document#add(Fieldable)} should be called by the Reader.
+   */
+  LOAD,
+
+  /**
+   * Lazily load this {@link Field}.  This means the {@link Field} is valid, but it may not actually contain its data until
+   * invoked.  {@link Document#getField(String)} SHOULD NOT BE USED.  {@link Document#getFieldable(String)} is safe to use and should
+   * return a valid instance of a {@link Fieldable}.
+   *<p/>
+   * {@link Document#add(Fieldable)} should be called by the Reader.
+   */
+  LAZY_LOAD,
+
+  /**
+   * Do not load the {@link Field}.  {@link Document#getField(String)} and {@link Document#getFieldable(String)} should return null.
+   * {@link Document#add(Fieldable)} is not called.
+   *<p/>
+   * {@link Document#add(Fieldable)} should not be called by the Reader.
+   */
+  NO_LOAD,
+
+  /**
+   * Load this field as in the {@link #LOAD} case, but immediately return from {@link Field} loading for the {@link Document}.  Thus, the
+   * Document may not have its complete set of Fields.  {@link Document#getField(String)} and {@link Document#getFieldable(String)} should
+   * both be valid for this {@link Field}
+   * <p/>
+   * {@link Document#add(Fieldable)} should be called by the Reader.
+   */
+  LOAD_AND_BREAK,
+
+  /** Expert:  Load the size of this {@link Field} rather than its value.
+   * Size is measured as number of bytes required to store the field == bytes for a binary or any compressed value, and 2*chars for a String value.
+   * The size is stored as a binary value, represented as an int in a byte[], with the higher order byte first in [0]
+   */
+  SIZE,
+
+  /** Expert: Like {@link #SIZE} but immediately break from the field loading loop, i.e., stop loading further fields, after the size is loaded */
+  SIZE_AND_BREAK,
+
+  /**
+   * Lazily load this {@link Field}, but do not cache the result.  This means the {@link Field} is valid, but it may not actually contain its data until
+   * invoked.  {@link Document#getField(String)} SHOULD NOT BE USED.  {@link Document#getFieldable(String)} is safe to use and should
+   * return a valid instance of a {@link Fieldable}.
+   *<p/>
+   * {@link Document#add(Fieldable)} should be called by the Reader.
+   */
+  LATENT
+}
Index: lucene/src/java/org/apache/lucene/document2/Document.java
===================================================================
--- lucene/src/java/org/apache/lucene/document2/Document.java (revision 0)
+++ lucene/src/java/org/apache/lucene/document2/Document.java (revision 0)
@@ -0,0 +1,346 @@
+package org.apache.lucene.document2;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.*;
+
+import org.apache.lucene.index.IndexReader;  // for javadoc
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.search.IndexSearcher;  // for javadoc
+import org.apache.lucene.search.ScoreDoc; // for javadoc
+
+/** Documents are the unit of indexing and search.
+ *
+ * A Document is a set of fields.  Each field has a name and a textual value.
+ * A field may be {@link IndexableField#stored() stored} with the document, in which
+ * case it is returned with search hits on the document.  Thus each document
+ * should typically contain one or more stored fields which uniquely identify
+ * it.
+ *
+ *

Note that fields which are not {@link IndexableField#stored() stored} are
+ * not available in documents retrieved from the index, e.g. with {@link
+ * ScoreDoc#doc} or {@link IndexReader#document(int)}.
+ */
+
+public final class Document implements Iterable<IndexableField> {
+
+  // the fields of this document, in the order in which they were added
+  List<IndexableField> fields = new ArrayList<IndexableField>();
+  private float boost = 1.0f;
+
+  /** Constructs a new document with no fields. */
+  public Document() {}
+
+  /**
+   * Returns an iterator over the fields of this document in the order in which
+   * they were added. The iterator does not support <code>remove()</code>.
+   */
+  // @Override not until Java 1.6
+  public Iterator<IndexableField> iterator() {
+    // nocommit this shim code is temporary!!  only here as an
+    // example... we will fix it "properly" for LUCENE-2308
+
+    // nocommit -- must multiply in docBoost to each
+    // provided field
+
+    return new Iterator<IndexableField>() {
+      private int fieldUpto = 0;
+
+      public boolean hasNext() {
+        return fieldUpto < fields.size();
+      }
+
+      public void remove() {
+        throw new UnsupportedOperationException();
+      }
+
+      public IndexableField next() {
+        // honor the Iterator contract instead of leaking the list's
+        // IndexOutOfBoundsException
+        if (!hasNext()) {
+          throw new NoSuchElementException();
+        }
+        return fields.get(fieldUpto++);
+      }
+    };
+  }
+
+  /** Sets a boost factor for hits on any field of this document.  This value
+   * will be multiplied into the score of all hits on this document.
+   *
+   * <p>The default value is 1.0.
+   *
+   * <p>Values are multiplied into the value of {@link IndexableField#boost()} of
+   * each field in this document.  Thus, this method in effect sets a default
+   * boost for the fields of this document.
+   *
+   * @see Field#setBoost(float)
+   */
+  public void setBoost(float boost) {
+    this.boost = boost;
+  }
+
+  /** Returns, at indexing time, the boost factor as set by {@link #setBoost(float)}.
+   *
+   * <p>Note that once a document is indexed this value is no longer available
+   * from the index.  At search time, for retrieved documents, this method always
+   * returns 1. This however does not mean that the boost value set at  indexing
+   * time was ignored - it was just combined with other indexing time factors and
+   * stored elsewhere, for better indexing and search performance. (For more
+   * information see the "norm(t,d)" part of the scoring formula in
+   * {@link org.apache.lucene.search.Similarity Similarity}.)
+   *
+   * @see #setBoost(float)
+   */
+  // @Override not until Java 1.6
+  public float getBoost() {
+    return boost;
+  }
+
+  /**
+   * <p>Adds a field to a document.  Several fields may be added with
+   * the same name.  In this case, if the fields are indexed, their text is
+   * treated as though appended for the purposes of search.</p>
+   * <p> Note that add like the removeField(s) methods only makes sense
+   * prior to adding a document to an index. These methods cannot
+   * be used to change the content of an existing index! In order to achieve this,
+   * a document has to be deleted from an index and a new changed version of that
+   * document has to be added.</p>
+   */
+  public final void add(IndexableField field) {
+    fields.add(field);
+  }
+
+  /**
+   * <p>Removes field with the specified name from the document.
+   * If multiple fields exist with this name, this method removes the first field that has been added.
+   * If there is no field with the specified name, the document remains unchanged.</p>
+   * <p> Note that the removeField(s) methods like the add method only make sense
+   * prior to adding a document to an index. These methods cannot
+   * be used to change the content of an existing index! In order to achieve this,
+   * a document has to be deleted from an index and a new changed version of that
+   * document has to be added.</p>
+   */
+  public final void removeField(String name) {
+    Iterator<IndexableField> it = fields.iterator();
+    while (it.hasNext()) {
+      IndexableField field = it.next();
+      if (field.name().equals(name)) {
+        it.remove();
+        return;
+      }
+    }
+  }
+
+  /**
+   * <p>Removes all fields with the given name from the document.
+   * If there is no field with the specified name, the document remains unchanged.</p>
+   * <p> Note that the removeField(s) methods like the add method only make sense
+   * prior to adding a document to an index. These methods cannot
+   * be used to change the content of an existing index! In order to achieve this,
+   * a document has to be deleted from an index and a new changed version of that
+   * document has to be added.</p>
+   */
+  public final void removeFields(String name) {
+    Iterator<IndexableField> it = fields.iterator();
+    while (it.hasNext()) {
+      IndexableField field = it.next();
+      if (field.name().equals(name)) {
+        it.remove();
+      }
+    }
+  }
+
+  /** Returns a field with the given name if any exist in this document, or
+   * null.  If multiple fields exists with this name, this method returns the
+   * first value added.
+   * Do not use this method with lazy loaded fields or {@link NumericField}.
+   * @deprecated use {@link #getIndexableField} instead and cast depending on
+   * data type.
+   * @throws ClassCastException if you try to retrieve a numerical or
+   * lazy loaded field.
+  @Deprecated
+  public final Field getField(String name) {
+    return (Field) getIndexableField(name);
+  }
+   */
+
+
+  /** Returns a field with the given name if any exist in this document, or
+   * null.  If multiple fields exists with this name, this method returns the
+   * first value added.
+  public IndexableField getIndexableField(String name) {
+    for (IndexableField field : fields) {
+      if (field.name().equals(name))
+        return field;
+    }
+    return null;
+  }
+   */
+
+  /** Returns the string value of the field with the given name if any exist in
+   * this document, or null.  If multiple fields exist with this name, this
+   * method returns the first value added. If only binary fields with this name
+   * exist, returns null.
+   * For {@link NumericField} it returns the string value of the number. If you want
+   * the actual {@code NumericField} instance back, use {@link #getIndexableField}.
+   */
+  public final String get(String name) {
+    for (IndexableField field : fields) {
+      if (field.name().equals(name) && !((Field) field).isBinary())
+        return field.stringValue();
+    }
+    return null;
+  }
+
+  /** Returns a List of all the fields in a document.
+   * <p>Note that fields which are not {@link IndexableField#stored() stored} are
+   * not available in documents retrieved from the
+   * index, e.g. {@link IndexSearcher#doc(int)} or {@link
+   * IndexReader#document(int)}.
+  public final List<IndexableField> getFields() {
+    return fields;
+  }
+   */
+
+  private final static Field[] NO_FIELDS = new Field[0];
+
+  /**
+   * Returns an array of {@link Field}s with the given name.
+   * This method returns an empty array when there are no
+   * matching fields.  It never returns null.
+   * Do not use this method with lazy loaded fields or {@link NumericField}.
+   *
+   * @param name the name of the field
+   * @return a <code>Field[]</code> array
+   * @deprecated use {@link #getIndexableField} instead and cast depending on
+   * data type.
+   * @throws ClassCastException if you try to retrieve a numerical or
+   * lazy loaded field.
+  @Deprecated
+  public final Field[] getFields(String name) {
+    List<Field> result = new ArrayList<Field>();
+    for (IndexableField field : fields) {
+      if (field.name().equals(name)) {
+        result.add((Field) field);
+      }
+    }
+
+    if (result.size() == 0)
+      return NO_FIELDS;
+
+    return result.toArray(new Field[result.size()]);
+  }
+   */
+
+
+  private final static IndexableField[] NO_INDEXABLEFIELDS = new IndexableField[0];
+
+  /**
+   * Returns an array of {@link IndexableField}s with the given name.
+   * This method returns an empty array when there are no
+   * matching fields.  It never returns null.
+   *
+   * @param name the name of the field
+   * @return a <code>IndexableField[]</code> array
+  public IndexableField[] getIndexableFields(String name) {
+    List<IndexableField> result = new ArrayList<IndexableField>();
+    for (IndexableField field : fields) {
+      if (field.name().equals(name)) {
+        result.add(field);
+      }
+    }
+
+    if (result.size() == 0)
+      return NO_INDEXABLEFIELDS;
+
+    return result.toArray(new IndexableField[result.size()]);
+  }
+   */
+
+
+  private final static String[] NO_STRINGS = new String[0];
+
+  /**
+   * Returns an array of values of the field specified as the method parameter.
+   * This method returns an empty array when there are no
+   * matching fields.  It never returns null.
+   * For {@link NumericField}s it returns the string value of the number. If you want
+   * the actual {@code NumericField} instances back, use {@link #getIndexableFields}.
+   * @param name the name of the field
+   * @return a <code>String[]</code> of field values
+  public final String[] getValues(String name) {
+    List<String> result = new ArrayList<String>();
+    for (IndexableField field : fields) {
+      if (field.name().equals(name) && !((Field) field).isBinary())
+        result.add(field.stringValue());
+    }
+
+    if (result.size() == 0)
+      return NO_STRINGS;
+
+    return result.toArray(new String[result.size()]);
+  }
+   */
+
+  private final static byte[][] NO_BYTES = new byte[0][];
+
+  /**
+   * Returns an array of byte arrays for of the fields that have the name specified
+   * as the method parameter.  This method returns an empty
+   * array when there are no matching fields.  It never
+   * returns null.
+   *
+   * @param name the name of the field
+   * @return a <code>byte[][]</code> of binary field values
+  public final byte[][] getBinaryValues(String name) {
+    List<byte[]> result = new ArrayList<byte[]>();
+    for (IndexableField field : fields) {
+      if (field.name().equals(name) && ((Field) field).isBinary())
+        result.add(field.binaryValue(null).bytes);
+    }
+
+    if (result.size() == 0)
+      return NO_BYTES;
+
+    return result.toArray(new byte[result.size()][]);
+  }
+   */
+
+  /**
+   * Returns an array of bytes for the first (or only) field that has the name
+   * specified as the method parameter. This method will return <code>null</code>
+   * if no binary fields with the specified name are available.
+   * There may be non-binary fields with the same name.
+   *
+   * @param name the name of the field.
+   * @return a <code>byte[]</code> containing the binary field value or <code>null</code>
+  public final byte[] getBinaryValue(String name) {
+    for (IndexableField field : fields) {
+      if (field.name().equals(name) && ((Field) field).isBinary())
+        return field.binaryValue(null).bytes;
+    }
+    return null;
+  }
+   */
+
+  /** Prints the fields of a document for human consumption. */
+  @Override
+  public final String toString() {
+    StringBuilder buffer = new StringBuilder();
+    buffer.append("Document<");
+    for (int i = 0; i < fields.size(); i++) {
+      IndexableField field = fields.get(i);
+      buffer.append(field.toString());
+      if (i != fields.size()-1)
+        buffer.append(" ");
+    }
+    buffer.append(">");
+    return buffer.toString();
+  }
+}
Index: lucene/src/java/org/apache/lucene/document2/StringField.java
===================================================================
--- lucene/src/java/org/apache/lucene/document2/StringField.java (revision 0)
+++ lucene/src/java/org/apache/lucene/document2/StringField.java (revision 0)
@@ -0,0 +1,42 @@
+package org.apache.lucene.document2;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A {@link Field} with a String value that always uses
+ * {@link #DEFAULT_TYPE}: indexed, with norms and term frequencies/positions
+ * omitted.
+ */
+public final class StringField extends Field {
+
+  // NOTE(review): FieldType is mutable and this instance is shared by every
+  // StringField, so a setter call on it changes all of them -- confirm that no
+  // caller modifies it.
+  public static final FieldType DEFAULT_TYPE = new FieldType()
+      .setIndexed(true)
+      .setOmitNorms(true)
+      .setOmitTermFreqAndPositions(true);
+
+  /**
+   * Creates a StringField.
+   *
+   * @param internName whether the field name should be interned
+   */
+  public StringField(String name, boolean internName, String value) {
+    // forward internName; the three-argument super constructor always interns
+    super(name, internName, StringField.DEFAULT_TYPE, value);
+  }
+
+  /** Creates a StringField whose name is interned. */
+  public StringField(String name, String value) {
+    this(name, true, value);
+  }
+
+  /** Returns the value converted with toString(), or null if there is no value. */
+  public String stringValue() {
+    return (fieldsData == null) ? null : fieldsData.toString();
+  }
+
+  // NOTE(review): Field declares numeric(), not isNumeric(), so this is an
+  // additional method rather than an override -- confirm which name is intended.
+  public boolean isNumeric() {
+    return false;
+  }
+}
Index: lucene/src/java/org/apache/lucene/document2/DateTools.java
===================================================================
--- lucene/src/java/org/apache/lucene/document2/DateTools.java (revision 0)
+++ lucene/src/java/org/apache/lucene/document2/DateTools.java (revision 0)
@@ -0,0 +1,210 @@
+package org.apache.lucene.document2;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.NumericRangeQuery; // for javadocs
+import org.apache.lucene.util.NumericUtils; // for javadocs
+
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.Locale;
+import java.util.TimeZone;
+
+/**
+ * Provides support for converting dates to strings and vice-versa.
+ * The strings are structured so that lexicographic sorting orders
+ * them by date, which makes them suitable for use as field values
+ * and search terms.
+ *
+ * <P>This class also helps you to limit the resolution of your dates. Do not
+ * save dates with a finer resolution than you really need, as then
+ * RangeQuery and PrefixQuery will require more memory and become slower.
+ *
+ * <P>
+ * Another approach is {@link NumericUtils}, which provides
+ * a sortable binary representation (prefix encoded) of numeric values, which
+ * date/time are.
+ * For indexing a {@link Date} or {@link Calendar}, just get the unix timestamp as
+ * <code>long</code> using {@link Date#getTime} or {@link Calendar#getTimeInMillis} and
+ * index this as a numeric value with {@link NumericField}
+ * and use {@link NumericRangeQuery} to query it.
+ */
+public class DateTools {
+
+  final static TimeZone GMT = TimeZone.getTimeZone("GMT");
+
+  private static final ThreadLocal<Calendar> TL_CAL = new ThreadLocal<Calendar>() { // one GMT calendar per thread, reused by round(long, Resolution)
+    @Override
+    protected Calendar initialValue() {
+      return Calendar.getInstance(GMT, Locale.US);
+    }
+  };
+
+  // per-thread clones of the Resolution formats (SimpleDateFormat is not thread-safe), indexed by format length
+  private static final ThreadLocal<SimpleDateFormat[]> TL_FORMATS = new ThreadLocal<SimpleDateFormat[]>() {
+    @Override
+    protected SimpleDateFormat[] initialValue() {
+      SimpleDateFormat[] arr = new SimpleDateFormat[Resolution.MILLISECOND.formatLen+1]; // slots for lengths 0..17; only the Resolution lengths get filled
+      for (Resolution resolution : Resolution.values()) {
+        arr[resolution.formatLen] = (SimpleDateFormat)resolution.format.clone();
+      }
+      return arr;
+    }
+  };
+
+  // cannot create, the class has static methods only
+  private DateTools() {}
+
+  /**
+   * Converts a Date to a string suitable for indexing.
+   *
+   * @param date the date to be converted
+   * @param resolution the desired resolution, see
+   *  {@link #round(Date, DateTools.Resolution)}
+   * @return a string in format <code>yyyyMMddHHmmssSSS</code> or shorter,
+   *  depending on <code>resolution</code>; using GMT as timezone
+   */
+  public static String dateToString(Date date, Resolution resolution) {
+    return timeToString(date.getTime(), resolution);
+  }
+
+  /**
+   * Converts a millisecond time to a string suitable for indexing.
+   *
+   * @param time the date expressed as milliseconds since January 1, 1970, 00:00:00 GMT
+   * @param resolution the desired resolution, see
+   *  {@link #round(long, DateTools.Resolution)}
+   * @return a string in format <code>yyyyMMddHHmmssSSS</code> or shorter,
+   *  depending on <code>resolution</code>; using GMT as timezone
+   */
+  public static String timeToString(long time, Resolution resolution) {
+    final Date date = new Date(round(time, resolution));
+    return TL_FORMATS.get()[resolution.formatLen].format(date);
+  }
+
+  /**
+   * Converts a string produced by <code>timeToString</code> or
+   * <code>dateToString</code> back to a time, represented as the
+   * number of milliseconds since January 1, 1970, 00:00:00 GMT.
+   *
+   * @param dateString the date string to be converted
+   * @return the number of milliseconds since January 1, 1970, 00:00:00 GMT
+   * @throws ParseException if <code>dateString</code> is not in the
+   *  expected format
+   */
+  public static long stringToTime(String dateString) throws ParseException {
+    return stringToDate(dateString).getTime();
+  }
+
+  /**
+   * Converts a string produced by <code>timeToString</code> or
+   * <code>dateToString</code> back to a time, represented as a
+   * Date object. The format is chosen by the length of the string.
+   *
+   * @param dateString the date string to be converted
+   * @return the parsed time as a Date object
+   * @throws ParseException if <code>dateString</code> is not in the
+   *  expected format; the underlying failure is attached as the cause
+   */
+  public static Date stringToDate(String dateString) throws ParseException {
+    try {
+      return TL_FORMATS.get()[dateString.length()].parse(dateString);
+    } catch (Exception e) { // also reached for lengths without a format (empty slot or index out of range) and for null input
+      throw (ParseException) new ParseException("Input is not a valid date string: " + dateString, 0).initCause(e);
+    }
+  }
+
+  /**
+   * Limit a date's resolution. For example, the date <code>2004-09-21 13:50:11</code>
+   * will be changed to <code>2004-09-01 00:00:00</code> when using
+   * <code>Resolution.MONTH</code>. The calendar fields are evaluated in GMT.
+   * @param date the date whose resolution is to be limited
+   * @param resolution The desired resolution of the date to be returned
+   * @return the date with all values more precise than <code>resolution</code>
+   *  set to 0 or 1
+   */
+  public static Date round(Date date, Resolution resolution) {
+    return new Date(round(date.getTime(), resolution));
+  }
+
+  /**
+   * Limit a date's resolution. For example, the date <code>1095774611000</code>
+   * (which represents 2004-09-21 13:50:11 GMT) will be changed to
+   * <code>1093996800000</code> (2004-09-01 00:00:00 GMT) when using
+   * <code>Resolution.MONTH</code>.
+   * @param time the time to be limited, in milliseconds since January 1, 1970, 00:00:00 GMT
+   * @param resolution The desired resolution of the date to be returned
+   * @return the date with all values more precise than <code>resolution</code>
+   *  set to 0 or 1, expressed as milliseconds since January 1, 1970, 00:00:00 GMT
+   */
+  @SuppressWarnings("fallthrough")
+  public static long round(long time, Resolution resolution) {
+    final Calendar calInstance = TL_CAL.get();
+    calInstance.setTimeInMillis(time);
+
+    switch (resolution) {
+      //NOTE: switch statement fall-through is deliberate
+      case YEAR:
+        calInstance.set(Calendar.MONTH, 0);
+      case MONTH:
+        calInstance.set(Calendar.DAY_OF_MONTH, 1);
+      case DAY:
+        calInstance.set(Calendar.HOUR_OF_DAY, 0);
+      case HOUR:
+        calInstance.set(Calendar.MINUTE, 0);
+      case MINUTE:
+        calInstance.set(Calendar.SECOND, 0);
+      case SECOND:
+        calInstance.set(Calendar.MILLISECOND, 0);
+      case MILLISECOND:
+        // don't cut off anything
+        break;
+      default:
+        throw new IllegalArgumentException("unknown resolution " + resolution);
+    }
+    return calInstance.getTimeInMillis();
+  }
+
+  /** Specifies the time granularity. */
+  public static enum Resolution {
+
+    YEAR(4), MONTH(6), DAY(8), HOUR(10), MINUTE(12), SECOND(14), MILLISECOND(17);
+
+    final int formatLen;
+    final SimpleDateFormat format;//should be cloned before use, since it's not threadsafe
+
+    Resolution(int formatLen) {
+      this.formatLen = formatLen;
+      // formatLen 10's place:                     11111111
+      // formatLen  1's place:            12345678901234567
+      this.format = new SimpleDateFormat("yyyyMMddHHmmssSSS".substring(0,formatLen),Locale.US);
+      this.format.setTimeZone(GMT);
+    }
+
+    /** this method returns the name of the resolution
+     * in lowercase (for backwards compatibility) */
+    @Override
+    public String toString() {
+      return super.toString().toLowerCase(Locale.ENGLISH);
+    }
+
+  }
+
+}
Index: lucene/src/java/org/apache/lucene/document2/FieldSelector.java
===================================================================
--- lucene/src/java/org/apache/lucene/document2/FieldSelector.java (revision 0)
+++ lucene/src/java/org/apache/lucene/document2/FieldSelector.java (revision 0)
@@ -0,0 +1,33 @@
+package org.apache.lucene.document2;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Similar to a {@link java.io.FileFilter}, the FieldSelector allows one to make decisions about
+ * what Fields get loaded on a {@link Document} by {@link org.apache.lucene.index.IndexReader#document(int,org.apache.lucene.document.FieldSelector)}
+ * NOTE(review): the link above names org.apache.lucene.document.FieldSelector, not this document2 interface -- confirm the intended target.
+ **/
+public interface FieldSelector {
+
+  /**
+   * Reports, for one field name, the {@link FieldSelectorResult} that applies to it.
+   * @param fieldName the field to accept or reject
+   * @return an instance of {@link FieldSelectorResult}
+   * if the {@link Field} named <code>fieldName</code> should be loaded.
+   */
+  FieldSelectorResult accept(String fieldName);
+}
Index: lucene/src/java/org/apache/lucene/document2/CompressionTools.java
===================================================================
--- lucene/src/java/org/apache/lucene/document2/CompressionTools.java (revision 0)
+++ lucene/src/java/org/apache/lucene/document2/CompressionTools.java (revision 0)
@@ -0,0 +1,127 @@
+package org.apache.lucene.document2;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.zip.Deflater;
+import java.util.zip.Inflater;
+import java.util.zip.DataFormatException;
+import java.io.ByteArrayOutputStream;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.UnicodeUtil;
+
+/** Simple utility class providing static methods to
+ *  compress and decompress binary data for stored fields.
+ *  This class uses java.util.zip.Deflater and Inflater
+ *  classes to compress and decompress.
+ */
+
+public class CompressionTools {
+
+  // Export only static methods
+  private CompressionTools() {}
+
+  /** Compresses the specified byte range using the
+   *  specified compressionLevel (constants are defined in
+   *  java.util.zip.Deflater). */
+  public static byte[] compress(byte[] value, int offset, int length, int compressionLevel) {
+
+    /* Create an expandable byte array to hold the compressed data.
+     * You cannot use an array that's the same size as the original because
+     * there is no guarantee that the compressed data will be smaller than
+     * the uncompressed data. */
+    ByteArrayOutputStream bos = new ByteArrayOutputStream(length);
+
+    Deflater compressor = new Deflater();
+
+    try {
+      compressor.setLevel(compressionLevel);
+      compressor.setInput(value, offset, length);
+      compressor.finish();
+
+      // Compress the data
+      final byte[] buf = new byte[1024];
+      while (!compressor.finished()) {
+        int count = compressor.deflate(buf);
+        bos.write(buf, 0, count);
+      }
+    } finally {
+      compressor.end(); // always release the deflater's native resources
+    }
+
+    return bos.toByteArray();
+  }
+
+  /** Compresses the specified byte range, with default BEST_COMPRESSION level */
+  public static byte[] compress(byte[] value, int offset, int length) {
+    return compress(value, offset, length, Deflater.BEST_COMPRESSION);
+  }
+
+  /** Compresses all bytes in the array, with default BEST_COMPRESSION level */
+  public static byte[] compress(byte[] value) {
+    return compress(value, 0, value.length, Deflater.BEST_COMPRESSION);
+  }
+
+  /** Compresses the String value, with default BEST_COMPRESSION level */
+  public static byte[] compressString(String value) {
+    return compressString(value, Deflater.BEST_COMPRESSION);
+  }
+
+  /** Compresses the String value using the specified
+   *  compressionLevel (constants are defined in
+   *  java.util.zip.Deflater). */
+  public static byte[] compressString(String value, int compressionLevel) {
+    BytesRef result = new BytesRef();
+    UnicodeUtil.UTF16toUTF8(value, 0, value.length(), result);
+    return compress(result.bytes, 0, result.length, compressionLevel);
+  }
+
+  /** Decompress the byte array previously returned by <code>compress</code>.
+   *  @throws DataFormatException if the data is malformed, truncated, or needs a preset dictionary */
+  public static byte[] decompress(byte[] value) throws DataFormatException {
+    // Create an expandable byte array to hold the decompressed data
+    ByteArrayOutputStream bos = new ByteArrayOutputStream(value.length);
+
+    Inflater decompressor = new Inflater();
+
+    try {
+      decompressor.setInput(value);
+      final byte[] buf = new byte[1024];
+      while (!decompressor.finished()) {
+        int count = decompressor.inflate(buf);
+        if (count == 0 && !decompressor.finished()) { // all input was supplied up front: no output and not finished means the loop could never end
+          throw new DataFormatException("compressed data is truncated or requires a preset dictionary");
+        }
+        bos.write(buf, 0, count);
+      }
+    } finally {
+      decompressor.end(); // always release the inflater's native resources
+    }
+    return bos.toByteArray();
+  }
+
+  /** Decompress the byte array previously returned by
+   *  <code>compressString</code> back into a String */
+  public static String decompressString(byte[] value) throws DataFormatException {
+    final byte[] bytes = decompress(value);
+    CharsRef result = new CharsRef(bytes.length);
+    UnicodeUtil.UTF8toUTF16(bytes, 0, bytes.length, result);
+    return new String(result.chars, 0, result.length);
+  }
+}
Index: lucene/src/java/org/apache/lucene/document2/BinaryField.java
===================================================================
--- lucene/src/java/org/apache/lucene/document2/BinaryField.java (revision 0)
+++ lucene/src/java/org/apache/lucene/document2/BinaryField.java (revision 0)
@@ -0,0 +1,33 @@
+package org.apache.lucene.document2;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public final class BinaryField extends Field { // Field subclass for byte[] values; always uses DEFAULT_TYPE below
+
+  public static final FieldType DEFAULT_TYPE = new FieldType() // NOTE(review): one shared instance; the chained setter suggests FieldType is mutable -- confirm nobody modifies it
+      .setStored(true);
+
+  public BinaryField(String name, byte[] value) {
+    super(name, BinaryField.DEFAULT_TYPE, value);
+    this.isBinary = true; // set after the super constructor has run
+  }
+
+  public boolean isNumeric() { // always false for this field class
+    return false;
+  }
+}
Index: lucene/src/java/org/apache/lucene/document2/LoadFirstFieldSelector.java
===================================================================
--- lucene/src/java/org/apache/lucene/document2/LoadFirstFieldSelector.java (revision 0)
+++ lucene/src/java/org/apache/lucene/document2/LoadFirstFieldSelector.java (revision 0)
@@ -0,0 +1,29 @@
+package org.apache.lucene.document2;
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Load the First field and break.
+ * <p>
+ * See {@link FieldSelectorResult#LOAD_AND_BREAK}
+ */
+public class LoadFirstFieldSelector implements FieldSelector {
+
+  public FieldSelectorResult accept(String fieldName) { // fieldName is not consulted: every field gets the same answer
+    return FieldSelectorResult.LOAD_AND_BREAK;
+  }
+}
\ No newline at end of file