diff -ruN -x .svn -x build trunk_2/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java docvalues/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java --- trunk_2/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java 2011-02-01 17:08:48.000000000 +0100 +++ docvalues/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java 2011-05-17 16:46:19.000000000 +0200 @@ -32,7 +32,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.index.*; -import org.apache.lucene.index.IndexReader.ReaderContext; +import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BitVector; import org.apache.lucene.util.BytesRef; @@ -487,4 +487,9 @@ } } } + + @Override + public PerDocValues perDocValues() throws IOException { + return null; + } } diff -ruN -x .svn -x build trunk_2/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java docvalues/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java --- trunk_2/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java 2011-05-28 09:04:37.000000000 +0200 +++ docvalues/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java 2011-05-19 22:32:12.000000000 +0200 @@ -52,6 +52,7 @@ import org.apache.lucene.index.TermVectorMapper; import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.IndexReader.ReaderContext; +import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.search.Collector; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; @@ -1278,6 +1279,11 @@ return Collections.unmodifiableSet(fields.keySet()); } + + @Override + public PerDocValues perDocValues() throws IOException { + return null; + } } diff -ruN -x .svn -x build trunk_2/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java docvalues/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java --- trunk_2/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java 2011-02-21 00:17:24.000000000 +0100 +++ docvalues/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java 2011-05-17 16:46:19.000000000 +0200 @@ -20,13 +20,19 @@ import java.io.IOException; import java.util.Set; +import org.apache.lucene.index.PerDocWriteState; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.DocValuesConsumer; +import org.apache.lucene.index.codecs.DefaultDocValuesProducer; import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.FieldsProducer; import org.apache.lucene.index.codecs.FixedGapTermsIndexReader; +import org.apache.lucene.index.codecs.PerDocConsumer; +import org.apache.lucene.index.codecs.DefaultDocValuesConsumer; +import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.index.codecs.standard.StandardCodec; import org.apache.lucene.index.codecs.PostingsReaderBase; import org.apache.lucene.index.codecs.standard.StandardPostingsReader; @@ -127,15 +133,28 @@ } @Override - public 
void files(Directory dir, SegmentInfo segmentInfo, String codecId, Set files) + public void files(Directory dir, SegmentInfo segmentInfo, int codecId, Set files) throws IOException { - StandardPostingsReader.files(dir, segmentInfo, codecId, files); - BlockTermsReader.files(dir, segmentInfo, codecId, files); - FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files); + final String codecIdAsString = "" + codecId; + StandardPostingsReader.files(dir, segmentInfo, codecIdAsString, files); + BlockTermsReader.files(dir, segmentInfo, codecIdAsString, files); + FixedGapTermsIndexReader.files(dir, segmentInfo, codecIdAsString, files); + DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files); } @Override public void getExtensions(Set extensions) { StandardCodec.getStandardExtensions(extensions); + DefaultDocValuesConsumer.getDocValuesExtensions(extensions); + } + + @Override + public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { + return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); + } + + @Override + public PerDocValues docsProducer(SegmentReadState state) throws IOException { + return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId); } } diff -ruN -x .svn -x build trunk_2/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictReader.java docvalues/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictReader.java --- trunk_2/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictReader.java 2011-02-21 00:17:24.000000000 +0100 +++ docvalues/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictReader.java 2011-05-17 16:46:19.000000000 +0200 @@ -33,7 +33,7 @@ public AppendingTermsDictReader(TermsIndexReaderBase indexReader, Directory dir, FieldInfos fieldInfos, String segment, PostingsReaderBase postingsReader, int readBufferSize, - int termsCacheSize, String codecId) throws IOException { + int termsCacheSize, int codecId) throws IOException { super(indexReader, dir, fieldInfos, segment, postingsReader, readBufferSize, termsCacheSize, codecId); } diff -ruN -x .svn -x build trunk_2/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsIndexReader.java docvalues/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsIndexReader.java --- trunk_2/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsIndexReader.java 2010-11-15 20:34:13.000000000 +0100 +++ docvalues/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsIndexReader.java 2011-05-17 16:46:19.000000000 +0200 @@ -30,7 +30,7 @@ public class AppendingTermsIndexReader extends FixedGapTermsIndexReader { public AppendingTermsIndexReader(Directory dir, FieldInfos fieldInfos, - String segment, int indexDivisor, Comparator termComp, String codecId) + String segment, int indexDivisor, Comparator termComp, int codecId) throws IOException { super(dir, fieldInfos, segment, indexDivisor, termComp, codecId); } diff -ruN -x .svn -x build trunk_2/lucene/contrib/misc/src/test/org/apache/lucene/index/TestIndexSplitter.java docvalues/lucene/contrib/misc/src/test/org/apache/lucene/index/TestIndexSplitter.java --- trunk_2/lucene/contrib/misc/src/test/org/apache/lucene/index/TestIndexSplitter.java 2011-05-28 09:04:38.000000000 +0200 +++ 
docvalues/lucene/contrib/misc/src/test/org/apache/lucene/index/TestIndexSplitter.java 2011-05-17 16:46:19.000000000 +0200 @@ -21,7 +21,7 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexWriterConfig.OpenMode; -import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; @@ -33,7 +33,7 @@ File destDir = new File(TEMP_DIR, "testfilesplitterdest"); _TestUtil.rmDir(destDir); destDir.mkdirs(); - Directory fsDir = newFSDirectory(dir); + FSDirectory fsDir = FSDirectory.open(dir); LogMergePolicy mergePolicy = new LogByteSizeMergePolicy(); mergePolicy.setNoCFSRatio(1); @@ -58,19 +58,14 @@ iw.addDocument(doc); } iw.commit(); - IndexReader iwReader = iw.getReader(); - assertEquals(3, iwReader.getSequentialSubReaders().length); - iwReader.close(); + assertEquals(3, iw.getReader().getSequentialSubReaders().length); iw.close(); // we should have 2 segments now IndexSplitter is = new IndexSplitter(dir); String splitSegName = is.infos.info(1).name; is.split(destDir, new String[] {splitSegName}); - Directory fsDirDest = newFSDirectory(destDir); - IndexReader r = IndexReader.open(fsDirDest, true); + IndexReader r = IndexReader.open(FSDirectory.open(destDir), true); assertEquals(50, r.maxDoc()); - r.close(); - fsDirDest.close(); // now test cmdline File destDir2 = new File(TEMP_DIR, "testfilesplitterdest2"); @@ -78,17 +73,12 @@ destDir2.mkdirs(); IndexSplitter.main(new String[] {dir.getAbsolutePath(), destDir2.getAbsolutePath(), splitSegName}); assertEquals(3, destDir2.listFiles().length); - Directory fsDirDest2 = newFSDirectory(destDir2); - r = IndexReader.open(fsDirDest2, true); + r = IndexReader.open(FSDirectory.open(destDir2), true); assertEquals(50, r.maxDoc()); - r.close(); - fsDirDest2.close(); // now remove the copied segment from src IndexSplitter.main(new String[] {dir.getAbsolutePath(), "-d", splitSegName}); - r = IndexReader.open(fsDir, true); + r = IndexReader.open(FSDirectory.open(dir), true); assertEquals(2, r.getSequentialSubReaders().length); - r.close(); - fsDir.close(); } } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/document/AbstractField.java docvalues/lucene/src/java/org/apache/lucene/document/AbstractField.java --- trunk_2/lucene/src/java/org/apache/lucene/document/AbstractField.java 2011-02-21 00:17:09.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/document/AbstractField.java 2011-05-19 22:32:18.000000000 +0200 @@ -19,6 +19,8 @@ import org.apache.lucene.search.spans.SpanQuery; // for javadocs import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.index.FieldInvertState; +import org.apache.lucene.index.values.PerDocFieldValues; +import org.apache.lucene.index.values.ValueType; import org.apache.lucene.util.StringHelper; // for javadocs @@ -47,6 +49,8 @@ // length/offset for all primitive types protected int binaryLength; protected int binaryOffset; + protected PerDocFieldValues docValues; + protected AbstractField() { @@ -289,4 +293,20 @@ result.append('>'); return result.toString(); } + + public PerDocFieldValues getDocValues() { + return docValues; + } + + public void setDocValues(PerDocFieldValues docValues) { + this.docValues = docValues; + } + + public boolean hasDocValues() { + return docValues != null && docValues.type() != null; + } + + public ValueType docValuesType() { + return docValues == null? 
null : docValues.type(); + } } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/document/Fieldable.java docvalues/lucene/src/java/org/apache/lucene/document/Fieldable.java --- trunk_2/lucene/src/java/org/apache/lucene/document/Fieldable.java 2011-02-21 00:17:09.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/document/Fieldable.java 2011-06-03 22:43:04.000000000 +0200 @@ -18,6 +18,9 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.index.FieldInvertState; // for javadocs +import org.apache.lucene.index.values.IndexDocValues; +import org.apache.lucene.index.values.PerDocFieldValues; +import org.apache.lucene.index.values.ValueType; import org.apache.lucene.search.PhraseQuery; // for javadocs import org.apache.lucene.search.spans.SpanQuery; // for javadocs @@ -206,4 +209,29 @@ * fail with an exception. */ void setOmitTermFreqAndPositions(boolean omitTermFreqAndPositions); + + /** + * Returns the {@link PerDocFieldValues} + */ + public PerDocFieldValues getDocValues(); + + /** + * Sets the {@link PerDocFieldValues} for this field. If + * {@link PerDocFieldValues} is set this field will store per-document values + * + * @see IndexDocValues + */ + public void setDocValues(PerDocFieldValues docValues); + + /** + * Returns true iff {@link PerDocFieldValues} are set on this + * field. + */ + public boolean hasDocValues(); + + /** + * Returns the {@link ValueType} of the set {@link PerDocFieldValues} or + * null if not set. + */ + public ValueType docValuesType(); } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/document/IndexDocValuesField.java docvalues/lucene/src/java/org/apache/lucene/document/IndexDocValuesField.java --- trunk_2/lucene/src/java/org/apache/lucene/document/IndexDocValuesField.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/document/IndexDocValuesField.java 2011-06-03 22:43:04.000000000 +0200 @@ -0,0 +1,286 @@ +package org.apache.lucene.document; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.Reader; +import java.util.Comparator; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.Field.TermVector; +import org.apache.lucene.index.values.PerDocFieldValues; +import org.apache.lucene.index.values.ValueType; +import org.apache.lucene.util.BytesRef; + +/** + *

+ * This class provides a {@link AbstractField} that enables storing of typed + * per-document values for scoring, sorting or value retrieval. Here's an + * example usage, adding an int value: + * + *

+ * document.add(new IndexDocValuesField(name).setInt(value));
+ * 
+ * + * For optimal performance, re-use the DocValuesField and + * {@link Document} instance for more than one document: + * + *
+ *  IndexDocValuesField field = new IndexDocValuesField(name);
+ *  Document document = new Document();
+ *  document.add(field);
+ * 
+ *  for(all documents) {
+ *    ...
+ *    field.setInt(value);
+ *    writer.addDocument(document);
+ *    ...
+ *  }
+ * 
+ * + *

+ * If doc values are stored in addition to an indexed ({@link Index}) or stored + * ({@link Store}) value it's recommended to use the {@link IndexDocValuesField}'s + * {@link #set(AbstractField)} API: + * + *

+ *  IndexDocValuesField field = new IndexDocValuesField(name);
+ *  Field indexedField = new Field(name, stringValue, Store.NO, Index.ANALYZED);
+ *  Document document = new Document();
+ *  document.add(indexedField);
+ *  field.set(indexedField);
+ *  for(all documents) {
+ *    ...
+ *    field.setInt(value);
+ *    writer.addDocument(document);
+ *    ...
+ *  }
+ * 
+ * + * */ +public class IndexDocValuesField extends AbstractField implements PerDocFieldValues { + + protected BytesRef bytes; + protected double doubleValue; + protected long longValue; + protected ValueType type; + protected Comparator bytesComparator; + + /** + * Creates a new {@link IndexDocValuesField} with the given name. + */ + public IndexDocValuesField(String name) { + super(name, Store.NO, Index.NO, TermVector.NO); + setDocValues(this); + } + + /** + * Creates a {@link IndexDocValuesField} prototype + */ + IndexDocValuesField() { + this(""); + } + + /** + * Sets the given long value and sets the field's {@link ValueType} to + * {@link ValueType#INTS} unless already set. If you want to change the + * default type use {@link #setType(ValueType)}. + */ + public void setInt(long value) { + if (type == null) { + type = ValueType.INTS; + } + longValue = value; + } + + /** + * Sets the given float value and sets the field's {@link ValueType} + * to {@link ValueType#FLOAT_32} unless already set. If you want to + * change the type use {@link #setType(ValueType)}. + */ + public void setFloat(float value) { + if (type == null) { + type = ValueType.FLOAT_32; + } + doubleValue = value; + } + + /** + * Sets the given double value and sets the field's {@link ValueType} + * to {@link ValueType#FLOAT_64} unless already set. If you want to + * change the default type use {@link #setType(ValueType)}. + */ + public void setFloat(double value) { + if (type == null) { + type = ValueType.FLOAT_64; + } + doubleValue = value; + } + + /** + * Sets the given {@link BytesRef} value and the field's {@link ValueType}. The + * comparator for this field is set to null. If a + * null comparator is set the default comparator for the given + * {@link ValueType} is used. + */ + public void setBytes(BytesRef value, ValueType type) { + setBytes(value, type, null); + } + + /** + * Sets the given {@link BytesRef} value, the field's {@link ValueType} and the + * field's comparator. If the {@link Comparator} is set to null + * the default for the given {@link ValueType} is used instead. + * + * @throws IllegalArgumentException + * if the value or the type are null + */ + public void setBytes(BytesRef value, ValueType type, Comparator comp) { + if (value == null) { + throw new IllegalArgumentException("value must not be null"); + } + setType(type); + if (bytes == null) { + bytes = new BytesRef(value); + } else { + bytes.copy(value); + } + bytesComparator = comp; + } + + /** + * Returns the set {@link BytesRef} or null if not set. + */ + public BytesRef getBytes() { + return bytes; + } + + /** + * Returns the set {@link BytesRef} comparator or null if not set + */ + public Comparator bytesComparator() { + return bytesComparator; + } + + /** + * Returns the set floating point value or 0.0d if not set. + */ + public double getFloat() { + return doubleValue; + } + + /** + * Returns the set long value of 0 if not set. + */ + public long getInt() { + return longValue; + } + + /** + * Sets the {@link BytesRef} comparator for this field. If the field has a + * numeric {@link ValueType} the comparator will be ignored. + */ + public void setBytesComparator(Comparator comp) { + this.bytesComparator = comp; + } + + /** + * Sets the {@link ValueType} for this field. 
+ */ + public void setType(ValueType type) { + if (type == null) { + throw new IllegalArgumentException("Type must not be null"); + } + this.type = type; + } + + /** + * Returns the field's {@link ValueType} + */ + public ValueType type() { + return type; + } + + /** + * Returns always null + */ + public Reader readerValue() { + return null; + } + + /** + * Returns always null + */ + public String stringValue() { + return null; + } + + /** + * Returns always null + */ + public TokenStream tokenStreamValue() { + return null; + } + + /** + * Sets this {@link IndexDocValuesField} to the given {@link AbstractField} and + * returns the given field. Any modifications to this instance will be visible + * to the given field. + */ + public T set(T field) { + field.setDocValues(this); + return field; + } + + /** + * Sets a new {@link PerDocFieldValues} instance on the given field with the + * given type and returns it. + * + */ + public static T set(T field, ValueType type) { + if (field instanceof IndexDocValuesField) + return field; + final IndexDocValuesField valField = new IndexDocValuesField(); + switch (type) { + case BYTES_FIXED_DEREF: + case BYTES_FIXED_SORTED: + case BYTES_FIXED_STRAIGHT: + case BYTES_VAR_DEREF: + case BYTES_VAR_SORTED: + case BYTES_VAR_STRAIGHT: + BytesRef ref = field.isBinary() ? new BytesRef(field.getBinaryValue(), + field.getBinaryOffset(), field.getBinaryLength()) : new BytesRef( + field.stringValue()); + valField.setBytes(ref, type); + break; + case INTS: + valField.setInt(Long.parseLong(field.stringValue())); + break; + case FLOAT_32: + valField.setFloat(Float.parseFloat(field.stringValue())); + break; + case FLOAT_64: + valField.setFloat(Double.parseDouble(field.stringValue())); + break; + default: + throw new IllegalArgumentException("unknown type: " + type); + } + return valField.set(field); + } + +} diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/CheckIndex.java docvalues/lucene/src/java/org/apache/lucene/index/CheckIndex.java --- trunk_2/lucene/src/java/org/apache/lucene/index/CheckIndex.java 2011-05-28 09:05:02.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/CheckIndex.java 2011-06-03 22:42:59.000000000 +0200 @@ -27,6 +27,9 @@ import org.apache.lucene.document.Document; import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.index.codecs.DefaultSegmentInfosWriter; +import org.apache.lucene.index.codecs.PerDocValues; +import org.apache.lucene.index.values.IndexDocValues; +import org.apache.lucene.index.values.ValuesEnum; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; @@ -195,6 +198,9 @@ /** Status for testing of term vectors (null if term vectors could not be tested). */ public TermVectorStatus termVectorStatus; + + /** Status for testing of DocValues (null if DocValues could not be tested). */ + public DocValuesStatus docValuesStatus; } /** @@ -254,6 +260,15 @@ /** Exception thrown during term vector test (null on success) */ public Throwable error = null; } + + public static final class DocValuesStatus { + /** Number of documents tested. */ + public int docCount; + /** Total number of docValues tested. */ + public long totalValueFields; + /** Exception thrown during doc values test (null on success) */ + public Throwable error = null; + } } /** Create a new CheckIndex on the directory. 
*/ @@ -499,6 +514,8 @@ // Test Term Vectors segInfoStat.termVectorStatus = testTermVectors(info, reader, nf); + + segInfoStat.docValuesStatus = testDocValues(info, reader); // Rethrow the first exception we encountered // This will cause stats for failed segments to be incremented properly @@ -510,6 +527,8 @@ throw new RuntimeException("Stored Field test failed"); } else if (segInfoStat.termVectorStatus.error != null) { throw new RuntimeException("Term Vector test failed"); + } else if (segInfoStat.docValuesStatus.error != null) { + throw new RuntimeException("DocValues test failed"); } msg(""); @@ -920,6 +939,60 @@ return status; } + + private Status.DocValuesStatus testDocValues(SegmentInfo info, + SegmentReader reader) { + final Status.DocValuesStatus status = new Status.DocValuesStatus(); + try { + if (infoStream != null) { + infoStream.print(" test: DocValues........"); + } + final FieldInfos fieldInfos = info.getFieldInfos(); + for (FieldInfo fieldInfo : fieldInfos) { + if (fieldInfo.hasDocValues()) { + status.totalValueFields++; + final PerDocValues perDocValues = reader.perDocValues(); + final IndexDocValues docValues = perDocValues.docValues(fieldInfo.name); + if (docValues == null) { + continue; + } + final ValuesEnum values = docValues.getEnum(); + while (values.nextDoc() != ValuesEnum.NO_MORE_DOCS) { + switch (fieldInfo.docValues) { + case BYTES_FIXED_DEREF: + case BYTES_FIXED_SORTED: + case BYTES_FIXED_STRAIGHT: + case BYTES_VAR_DEREF: + case BYTES_VAR_SORTED: + case BYTES_VAR_STRAIGHT: + values.bytes(); + break; + case FLOAT_32: + case FLOAT_64: + values.getFloat(); + break; + case INTS: + values.getInt(); + break; + default: + throw new IllegalArgumentException("Field: " + fieldInfo.name + + " - no such DocValues type: " + fieldInfo.docValues); + } + } + } + } + + msg("OK [" + status.docCount + " total doc Count; Num DocValues Fields " + + status.totalValueFields); + } catch (Throwable e) { + msg("ERROR [" + String.valueOf(e.getMessage()) + "]"); + status.error = e; + if (infoStream != null) { + e.printStackTrace(infoStream); + } + } + return status; + } /** * Test term vectors for a segment. 
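For context, here is a minimal usage sketch of the per-document values API these hunks introduce: IndexDocValuesField on the write side, IndexReader#docValues plus ValuesEnum on the read side. It is not part of the patch; the IndexWriter/Directory setup is assumed to be the usual trunk setup, and the reader is assumed to be an atomic segment reader, since DirectoryReader#perDocValues deliberately throws UnsupportedOperationException (see the DirectoryReader hunk below).

import org.apache.lucene.document.Document;
import org.apache.lucene.document.IndexDocValuesField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.values.IndexDocValues;
import org.apache.lucene.index.values.ValuesEnum;

// Sketch only: assumes an already configured IndexWriter and an atomic reader.
class DocValuesUsageSketch {

  void index(IndexWriter writer, long[] prices) throws Exception {
    // Re-use the field and document instances, as the IndexDocValuesField
    // javadoc above recommends.
    IndexDocValuesField priceField = new IndexDocValuesField("price");
    Document doc = new Document();
    doc.add(priceField);
    for (long price : prices) {
      priceField.setInt(price);   // implicitly selects ValueType.INTS
      writer.addDocument(doc);
    }
    writer.commit();
  }

  long sum(IndexReader atomicReader) throws Exception {
    // Mirrors the per-field loop run by CheckIndex.testDocValues() above.
    IndexDocValues docValues = atomicReader.docValues("price");
    if (docValues == null) {
      return 0;                   // no values stored for this field
    }
    ValuesEnum values = docValues.getEnum();
    long sum = 0;
    while (values.nextDoc() != ValuesEnum.NO_MORE_DOCS) {
      sum += values.getInt();
    }
    return sum;
  }
}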
diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/CompoundFileReader.java docvalues/lucene/src/java/org/apache/lucene/index/CompoundFileReader.java --- trunk_2/lucene/src/java/org/apache/lucene/index/CompoundFileReader.java 2011-05-28 09:05:02.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/CompoundFileReader.java 2011-06-03 22:42:58.000000000 +0200 @@ -158,7 +158,7 @@ throw new IOException("Stream closed"); id = IndexFileNames.stripSegmentName(id); - FileEntry entry = entries.get(id); + final FileEntry entry = entries.get(id); if (entry == null) throw new IOException("No sub-file with id " + id + " found (files: " + entries.keySet() + ")"); diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/DirectoryReader.java docvalues/lucene/src/java/org/apache/lucene/index/DirectoryReader.java --- trunk_2/lucene/src/java/org/apache/lucene/index/DirectoryReader.java 2011-05-28 09:05:02.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/DirectoryReader.java 2011-06-03 22:42:59.000000000 +0200 @@ -35,6 +35,7 @@ import org.apache.lucene.store.Lock; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.MapBackedSet; @@ -951,8 +952,8 @@ Collections.sort(commits); return commits; - } - + } + private static final class ReaderCommit extends IndexCommit { private String segmentsFileName; Collection files; @@ -1022,4 +1023,9 @@ throw new UnsupportedOperationException("This IndexCommit does not support deletions"); } } + + @Override + public PerDocValues perDocValues() throws IOException { + throw new UnsupportedOperationException("please use MultiPerDocValues#getPerDocs, or wrap your IndexReader with SlowMultiReaderWrapper, if you really need a top level Fields"); + } } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/DocFieldProcessor.java docvalues/lucene/src/java/org/apache/lucene/index/DocFieldProcessor.java --- trunk_2/lucene/src/java/org/apache/lucene/index/DocFieldProcessor.java 2011-05-29 22:19:08.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/DocFieldProcessor.java 2011-06-03 22:42:59.000000000 +0200 @@ -27,6 +27,10 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.index.DocumentsWriterPerThread.DocState; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.PerDocConsumer; +import org.apache.lucene.index.codecs.DocValuesConsumer; import org.apache.lucene.util.ArrayUtil; @@ -80,6 +84,9 @@ // FieldInfo.storePayload. final String fileName = IndexFileNames.segmentFileName(state.segmentName, "", IndexFileNames.FIELD_INFOS_EXTENSION); state.fieldInfos.write(state.directory, fileName); + for (DocValuesConsumer consumers : docValues.values()) { + consumers.finish(state.numDocs); + }; } @Override @@ -100,6 +107,14 @@ } } + for(PerDocConsumer consumer : perDocConsumers.values()) { + try { + consumer.close(); // TODO add abort to PerDocConsumer! + } catch (IOException e) { + // ignore on abort! 
+ } + } + try { fieldsWriter.abort(); } catch (Throwable t) { @@ -150,6 +165,15 @@ fieldHash = new DocFieldProcessorPerField[2]; hashMask = 1; totalFieldCount = 0; + for(PerDocConsumer consumer : perDocConsumers.values()) { + try { + consumer.close(); + } catch (IOException e) { + // ignore and continue closing remaining consumers + } + } + perDocConsumers.clear(); + docValues.clear(); } private void rehash() { @@ -215,7 +239,7 @@ // easily add it FieldInfo fi = fieldInfos.addOrUpdate(fieldName, field.isIndexed(), field.isTermVectorStored(), field.isStorePositionWithTermVector(), field.isStoreOffsetWithTermVector(), - field.getOmitNorms(), false, field.getOmitTermFreqAndPositions()); + field.getOmitNorms(), false, field.getOmitTermFreqAndPositions(), field.docValuesType()); fp = new DocFieldProcessorPerField(this, fi); fp.next = fieldHash[hashPos]; @@ -227,7 +251,7 @@ } else { fieldInfos.addOrUpdate(fp.fieldInfo.name, field.isIndexed(), field.isTermVectorStored(), field.isStorePositionWithTermVector(), field.isStoreOffsetWithTermVector(), - field.getOmitNorms(), false, field.getOmitTermFreqAndPositions()); + field.getOmitNorms(), false, field.getOmitTermFreqAndPositions(), field.docValuesType()); } if (thisFieldGen != fp.lastGen) { @@ -251,6 +275,10 @@ if (field.isStored()) { fieldsWriter.addField(field, fp.fieldInfo); } + if (field.hasDocValues()) { + final DocValuesConsumer docValuesConsumer = docValuesConsumer(docState, fp.fieldInfo); + docValuesConsumer.add(docState.docID, field.getDocValues()); + } } // If we are writing vectors then we must visit @@ -286,4 +314,36 @@ } } + final private Map docValues = new HashMap(); + final private Map perDocConsumers = new HashMap(); + + DocValuesConsumer docValuesConsumer(DocState docState, FieldInfo fieldInfo) + throws IOException { + DocValuesConsumer docValuesConsumer = docValues.get(fieldInfo.name); + if (docValuesConsumer != null) { + return docValuesConsumer; + } + PerDocConsumer perDocConsumer = perDocConsumers.get(fieldInfo.getCodecId()); + if (perDocConsumer == null) { + PerDocWriteState perDocWriteState = docState.docWriter.newPerDocWriteState(fieldInfo.getCodecId()); + SegmentCodecs codecs = perDocWriteState.segmentCodecs; + assert codecs.codecs.length > fieldInfo.getCodecId(); + Codec codec = codecs.codecs[fieldInfo.getCodecId()]; + perDocConsumer = codec.docsConsumer(perDocWriteState); + perDocConsumers.put(Integer.valueOf(fieldInfo.getCodecId()), perDocConsumer); + } + boolean success = false; + try { + docValuesConsumer = perDocConsumer.addValuesField(fieldInfo); + fieldInfo.commitDocValues(); + success = true; + } finally { + if (!success) { + fieldInfo.revertUncommitted(); + } + } + docValues.put(fieldInfo.name, docValuesConsumer); + return docValuesConsumer; + } + } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java docvalues/lucene/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java --- trunk_2/lucene/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java 2011-05-29 22:19:08.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java 2011-06-03 22:42:58.000000000 +0200 @@ -32,6 +32,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.util.BitVector; import org.apache.lucene.util.ByteBlockPool.Allocator; +import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator; import org.apache.lucene.util.RamUsageEstimator; public class DocumentsWriterPerThread { @@ -169,6 +170,7 @@ 
DocumentsWriterDeleteQueue deleteQueue; DeleteSlice deleteSlice; private final NumberFormat nf = NumberFormat.getInstance(); + final Allocator byteBlockAllocator; public DocumentsWriterPerThread(Directory directory, DocumentsWriter parent, @@ -181,9 +183,9 @@ this.docState = new DocState(this); this.docState.similarityProvider = parent.indexWriter.getConfig() .getSimilarityProvider(); - - consumer = indexingChain.getChain(this); bytesUsed = new AtomicLong(0); + byteBlockAllocator = new DirectTrackingAllocator(bytesUsed); + consumer = indexingChain.getChain(this); pendingDeletes = new BufferedDeletes(false); initialize(); } @@ -538,36 +540,13 @@ bytesUsed.addAndGet(-(length *(INT_BLOCK_SIZE*RamUsageEstimator.NUM_BYTES_INT))); } - final Allocator byteBlockAllocator = new DirectTrackingAllocator(); - - - private class DirectTrackingAllocator extends Allocator { - public DirectTrackingAllocator() { - this(BYTE_BLOCK_SIZE); - } - - public DirectTrackingAllocator(int blockSize) { - super(blockSize); - } - - @Override - public byte[] getByteBlock() { - bytesUsed.addAndGet(blockSize); - return new byte[blockSize]; - } - @Override - public void recycleByteBlocks(byte[][] blocks, int start, int end) { - bytesUsed.addAndGet(-((end-start)* blockSize)); - for (int i = start; i < end; i++) { - blocks[i] = null; - } - } - + PerDocWriteState newPerDocWriteState(int codecId) { + assert segment != null; + return new PerDocWriteState(infoStream, directory, segment, fieldInfos, bytesUsed, codecId); } void setInfoStream(PrintStream infoStream) { this.infoStream = infoStream; docState.infoStream = infoStream; } - } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/FieldInfo.java docvalues/lucene/src/java/org/apache/lucene/index/FieldInfo.java --- trunk_2/lucene/src/java/org/apache/lucene/index/FieldInfo.java 2011-06-04 00:08:50.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/FieldInfo.java 2011-05-19 22:32:17.000000000 +0200 @@ -1,5 +1,7 @@ package org.apache.lucene.index; +import org.apache.lucene.index.values.ValueType; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with @@ -24,6 +26,8 @@ public final int number; public boolean isIndexed; + ValueType docValues; + // true if term vector for this field should be stored boolean storeTermVector; @@ -38,10 +42,11 @@ FieldInfo(String na, boolean tk, int nu, boolean storeTermVector, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, - boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions) { + boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions, ValueType docValues) { name = na; isIndexed = tk; number = nu; + this.docValues = docValues; if (isIndexed) { this.storeTermVector = storeTermVector; this.storeOffsetWithTermVector = storeOffsetWithTermVector; @@ -72,7 +77,7 @@ @Override public Object clone() { FieldInfo clone = new FieldInfo(name, isIndexed, number, storeTermVector, storePositionWithTermVector, - storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions); + storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions, docValues); clone.codecId = this.codecId; return clone; } @@ -107,8 +112,23 @@ } assert !this.omitTermFreqAndPositions || !this.storePayloads; } - private boolean vectorsCommitted; + void setDocValues(ValueType v) { + if (docValues == null) { + docValues = v; + } + } + + public boolean hasDocValues() { + return docValues != null; + } + public ValueType getDocValues() { + return docValues; + } + + private boolean vectorsCommitted; + private boolean docValuesCommitted; + /** * Reverts all uncommitted changes on this {@link FieldInfo} * @see #commitVectors() @@ -119,6 +139,10 @@ storePositionWithTermVector = false; storeTermVector = false; } + + if (docValues != null && !docValuesCommitted) { + docValues = null; + } } /** @@ -131,4 +155,9 @@ assert storeTermVector; vectorsCommitted = true; } + + void commitDocValues() { + assert hasDocValues(); + docValuesCommitted = true; + } } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/FieldInfos.java docvalues/lucene/src/java/org/apache/lucene/index/FieldInfos.java --- trunk_2/lucene/src/java/org/apache/lucene/index/FieldInfos.java 2011-05-28 09:05:02.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/FieldInfos.java 2011-05-19 22:32:17.000000000 +0200 @@ -31,6 +31,7 @@ import org.apache.lucene.index.SegmentCodecs; // Required for Java 1.5 javadocs import org.apache.lucene.index.SegmentCodecs.SegmentCodecsBuilder; import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.index.values.ValueType; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; @@ -202,6 +203,9 @@ public static final int FORMAT_START = -2; public static final int FORMAT_PER_FIELD_CODEC = -3; + // Records index values for this field + public static final int FORMAT_INDEX_VALUES = -3; + // whenever you add a new format, make it 1 smaller (negative version logic)! static final int FORMAT_CURRENT = FORMAT_PER_FIELD_CODEC; @@ -410,7 +414,7 @@ synchronized public void addOrUpdate(String name, boolean isIndexed, boolean storeTermVector, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms) { addOrUpdate(name, isIndexed, storeTermVector, storePositionWithTermVector, - storeOffsetWithTermVector, omitNorms, false, false); + storeOffsetWithTermVector, omitNorms, false, false, null); } /** If the field is not yet known, adds it. 
If it is known, checks to make @@ -429,14 +433,14 @@ */ synchronized public FieldInfo addOrUpdate(String name, boolean isIndexed, boolean storeTermVector, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, - boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions) { + boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions, ValueType docValues) { return addOrUpdateInternal(name, -1, isIndexed, storeTermVector, storePositionWithTermVector, - storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions); + storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions, docValues); } synchronized private FieldInfo addOrUpdateInternal(String name, int preferredFieldNumber, boolean isIndexed, - boolean storeTermVector, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, - boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions) { + boolean storeTermVector, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, + boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions, ValueType docValues) { if (globalFieldNumbers == null) { throw new IllegalStateException("FieldInfos are read-only, create a new instance with a global field map to make modifications to FieldInfos"); } @@ -444,11 +448,12 @@ FieldInfo fi = fieldInfo(name); if (fi == null) { final int fieldNumber = nextFieldNumber(name, preferredFieldNumber); - fi = addInternal(name, fieldNumber, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions); + fi = addInternal(name, fieldNumber, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions, docValues); } else { fi.update(isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions); + fi.setDocValues(docValues); } - if (fi.isIndexed && fi.getCodecId() == FieldInfo.UNASSIGNED_CODEC_ID) { + if ((fi.isIndexed || fi.hasDocValues()) && fi.getCodecId() == FieldInfo.UNASSIGNED_CODEC_ID) { segmentCodecsBuilder.tryAddAndSet(fi); } version++; @@ -460,7 +465,7 @@ return addOrUpdateInternal(fi.name, fi.number, fi.isIndexed, fi.storeTermVector, fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, fi.omitNorms, fi.storePayloads, - fi.omitTermFreqAndPositions); + fi.omitTermFreqAndPositions, fi.docValues); } /* @@ -468,15 +473,14 @@ */ private FieldInfo addInternal(String name, int fieldNumber, boolean isIndexed, boolean storeTermVector, boolean storePositionWithTermVector, - boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions) { + boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions, ValueType docValuesType) { // don't check modifiable here since we use that to initially build up FIs name = StringHelper.intern(name); if (globalFieldNumbers != null) { globalFieldNumbers.setIfNotSet(fieldNumber, name); } final FieldInfo fi = new FieldInfo(name, isIndexed, fieldNumber, storeTermVector, storePositionWithTermVector, - storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions); - + storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions, docValuesType); putInternal(fi); return fi; } @@ -600,6 +604,45 @@ output.writeInt(fi.number); 
output.writeInt(fi.getCodecId()); output.writeByte(bits); + + final byte b; + + if (fi.docValues == null) { + b = 0; + } else { + switch(fi.docValues) { + case INTS: + b = 1; + break; + case FLOAT_32: + b = 2; + break; + case FLOAT_64: + b = 3; + break; + case BYTES_FIXED_STRAIGHT: + b = 4; + break; + case BYTES_FIXED_DEREF: + b = 5; + break; + case BYTES_FIXED_SORTED: + b = 6; + break; + case BYTES_VAR_STRAIGHT: + b = 7; + break; + case BYTES_VAR_DEREF: + b = 8; + break; + case BYTES_VAR_SORTED: + b = 9; + break; + default: + throw new IllegalStateException("unhandled indexValues type " + fi.docValues); + } + } + output.writeByte(b); } } @@ -637,7 +680,45 @@ } hasVectors |= storeTermVector; hasProx |= isIndexed && !omitTermFreqAndPositions; - final FieldInfo addInternal = addInternal(name, fieldNumber, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions); + ValueType docValuesType = null; + if (format <= FORMAT_INDEX_VALUES) { + final byte b = input.readByte(); + switch(b) { + case 0: + docValuesType = null; + break; + case 1: + docValuesType = ValueType.INTS; + break; + case 2: + docValuesType = ValueType.FLOAT_32; + break; + case 3: + docValuesType = ValueType.FLOAT_64; + break; + case 4: + docValuesType = ValueType.BYTES_FIXED_STRAIGHT; + break; + case 5: + docValuesType = ValueType.BYTES_FIXED_DEREF; + break; + case 6: + docValuesType = ValueType.BYTES_FIXED_SORTED; + break; + case 7: + docValuesType = ValueType.BYTES_VAR_STRAIGHT; + break; + case 8: + docValuesType = ValueType.BYTES_VAR_DEREF; + break; + case 9: + docValuesType = ValueType.BYTES_VAR_SORTED; + break; + default: + throw new IllegalStateException("unhandled indexValues type " + b); + } + } + final FieldInfo addInternal = addInternal(name, fieldNumber, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions, docValuesType); addInternal.setCodecId(codecId); } @@ -669,5 +750,5 @@ } return roFis; } - + } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/Fields.java docvalues/lucene/src/java/org/apache/lucene/index/Fields.java --- trunk_2/lucene/src/java/org/apache/lucene/index/Fields.java 2011-01-13 07:36:22.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/Fields.java 2011-05-17 16:46:44.000000000 +0200 @@ -31,6 +31,6 @@ /** Get the {@link Terms} for this field. This will return * null if the field does not exist. */ public abstract Terms terms(String field) throws IOException; - + public final static Fields[] EMPTY_ARRAY = new Fields[0]; } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/FieldsEnum.java docvalues/lucene/src/java/org/apache/lucene/index/FieldsEnum.java --- trunk_2/lucene/src/java/org/apache/lucene/index/FieldsEnum.java 2010-08-17 14:48:27.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/FieldsEnum.java 2011-06-03 22:42:58.000000000 +0200 @@ -19,6 +19,8 @@ import java.io.IOException; +import org.apache.lucene.index.values.IndexDocValues; +import org.apache.lucene.index.values.ValuesEnum; import org.apache.lucene.util.AttributeSource; /** Enumerates indexed fields. You must first call {@link @@ -55,7 +57,7 @@ * null this method should not be called. This method * will not return null. 
*/ public abstract TermsEnum terms() throws IOException; - + public final static FieldsEnum[] EMPTY_ARRAY = new FieldsEnum[0]; /** Provides zero fields */ diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java docvalues/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java --- trunk_2/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java 2011-02-01 17:08:45.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java 2011-05-17 16:46:44.000000000 +0200 @@ -19,7 +19,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; -import org.apache.lucene.index.IndexReader.ReaderContext; +import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.store.Directory; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; @@ -463,4 +463,9 @@ super.removeReaderFinishedListener(listener); in.removeReaderFinishedListener(listener); } + + @Override + public PerDocValues perDocValues() throws IOException { + return in.perDocValues(); + } } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/IndexFileNames.java docvalues/lucene/src/java/org/apache/lucene/index/IndexFileNames.java --- trunk_2/lucene/src/java/org/apache/lucene/index/IndexFileNames.java 2011-05-28 09:05:02.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/IndexFileNames.java 2011-06-03 22:42:58.000000000 +0200 @@ -80,10 +80,9 @@ /** Extension of separate norms */ public static final String SEPARATE_NORMS_EXTENSION = "s"; - + /** Extension of global field numbers */ public static final String GLOBAL_FIELD_NUM_MAP_EXTENSION = "fnx"; - /** * This array contains all filename extensions used by diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/IndexReader.java docvalues/lucene/src/java/org/apache/lucene/index/IndexReader.java --- trunk_2/lucene/src/java/org/apache/lucene/index/IndexReader.java 2011-06-04 00:09:03.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/IndexReader.java 2011-06-03 22:42:59.000000000 +0200 @@ -23,6 +23,8 @@ import org.apache.lucene.search.Similarity; import org.apache.lucene.index.codecs.Codec; import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.index.codecs.PerDocValues; +import org.apache.lucene.index.values.IndexDocValues; import org.apache.lucene.store.*; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; @@ -174,6 +176,9 @@ public static final FieldOption TERMVECTOR_WITH_OFFSET = new FieldOption ("TERMVECTOR_WITH_OFFSET"); /** All fields with termvectors with offset values and position values enabled */ public static final FieldOption TERMVECTOR_WITH_POSITION_OFFSET = new FieldOption ("TERMVECTOR_WITH_POSITION_OFFSET"); + /** All fields holding doc values */ + public static final FieldOption DOC_VALUES = new FieldOption ("DOC_VALUES"); + } private boolean closed; @@ -1064,6 +1069,21 @@ * using {@link ReaderUtil#gatherSubReaders} and iterate * through them yourself. */ public abstract Fields fields() throws IOException; + + /** + * Flex API: returns {@link PerDocValues} for this reader. + * This method may return null if the reader has no per-document + * values stored. + * + *

NOTE: if this is a multi reader ({@link + * #getSequentialSubReaders} is not null) then this + * method will throw UnsupportedOperationException. If + * you really need {@link PerDocValues} for such a reader, + * use {@link MultiPerDocValues#getPerDocs(IndexReader)}. However, for + * performance reasons, it's best to get all sub-readers + * using {@link ReaderUtil#gatherSubReaders} and iterate + * through them yourself. */ + public abstract PerDocValues perDocValues() throws IOException; public int docFreq(Term term) throws IOException { return docFreq(term.field(), term.bytes()); @@ -1565,7 +1585,14 @@ public int getTermInfosIndexDivisor() { throw new UnsupportedOperationException("This reader does not support this method."); } - + + public IndexDocValues docValues(String field) throws IOException { + final PerDocValues perDoc = perDocValues(); + if (perDoc == null) { + return null; + } + return perDoc.docValues(field); + } private volatile Fields fields; @@ -1578,6 +1605,19 @@ Fields retrieveFields() { return fields; } + + private volatile PerDocValues perDocValues; + + /** @lucene.internal */ + void storePerDoc(PerDocValues perDocValues) { + this.perDocValues = perDocValues; + } + + /** @lucene.internal */ + PerDocValues retrievePerDoc() { + return perDocValues; + } + /** * A struct like class that represents a hierarchical relationship between diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/MultiFields.java docvalues/lucene/src/java/org/apache/lucene/index/MultiFields.java --- trunk_2/lucene/src/java/org/apache/lucene/index/MultiFields.java 2010-12-17 20:14:12.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/MultiFields.java 2011-06-03 22:42:59.000000000 +0200 @@ -21,8 +21,12 @@ import java.util.Map; import java.util.List; import java.util.ArrayList; -import java.util.concurrent.ConcurrentHashMap; +import org.apache.lucene.index.values.IndexDocValues; +import org.apache.lucene.index.values.MultiIndexDocValues; +import org.apache.lucene.index.values.ValueType; +import org.apache.lucene.index.values.MultiIndexDocValues.DocValuesIndex; +import java.util.concurrent.ConcurrentHashMap; import org.apache.lucene.util.ReaderUtil; import org.apache.lucene.util.ReaderUtil.Gather; // for javadocs import org.apache.lucene.util.Bits; @@ -187,7 +191,7 @@ return fields.terms(field); } } - + /** Returns {@link DocsEnum} for the specified field & * term. This may return null if the term does not * exist. */ @@ -271,5 +275,6 @@ return result; } + } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/MultiFieldsEnum.java docvalues/lucene/src/java/org/apache/lucene/index/MultiFieldsEnum.java --- trunk_2/lucene/src/java/org/apache/lucene/index/MultiFieldsEnum.java 2011-03-23 18:54:07.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/MultiFieldsEnum.java 2011-06-03 22:42:59.000000000 +0200 @@ -17,6 +17,7 @@ * limitations under the License. */ +import org.apache.lucene.index.values.MultiIndexDocValues; import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.ReaderUtil; @@ -38,10 +39,14 @@ // Holds sub-readers containing field we are currently // on, popped from queue. 
private final FieldsEnumWithSlice[] top; + private final FieldsEnumWithSlice[] enumWithSlices; + private int numTop; // Re-used TermsEnum private final MultiTermsEnum terms; + private final MultiIndexDocValues docValues; + private String currentField; @@ -50,7 +55,9 @@ public MultiFieldsEnum(FieldsEnum[] subs, ReaderUtil.Slice[] subSlices) throws IOException { terms = new MultiTermsEnum(subSlices); queue = new FieldMergeQueue(subs.length); + docValues = new MultiIndexDocValues(); top = new FieldsEnumWithSlice[subs.length]; + List enumWithSlices = new ArrayList(); // Init q for(int i=0;i + * NOTE: for multi readers, you'll get better performance by gathering + * the sub readers using {@link ReaderUtil#gatherSubReaders} and then operate + * per-reader, instead of using this class. + * + * @lucene.experimental + */ +public class MultiPerDocValues extends PerDocValues { + private final PerDocValues[] subs; + private final ReaderUtil.Slice[] subSlices; + private final Map docValues = new ConcurrentHashMap(); + private final TreeSet fields; + + public MultiPerDocValues(PerDocValues[] subs, ReaderUtil.Slice[] subSlices) { + this.subs = subs; + this.subSlices = subSlices; + fields = new TreeSet(); + for (PerDocValues sub : subs) { + fields.addAll(sub.fields()); + } + } + + /** + * Returns a single {@link PerDocValues} instance for this reader, merging + * their values on the fly. This method will not return null. + * + *

+ * NOTE: this is a slow way to access postings. It's better to get the + * sub-readers (using {@link Gather}) and iterate through them yourself. + */ + public static PerDocValues getPerDocs(IndexReader r) throws IOException { + final IndexReader[] subs = r.getSequentialSubReaders(); + if (subs == null) { + // already an atomic reader + return r.perDocValues(); + } else if (subs.length == 0) { + // no fields + return null; + } else if (subs.length == 1) { + return getPerDocs(subs[0]); + } + PerDocValues perDocValues = r.retrievePerDoc(); + if (perDocValues == null) { + + final List producer = new ArrayList(); + final List slices = new ArrayList(); + + new ReaderUtil.Gather(r) { + @Override + protected void add(int base, IndexReader r) throws IOException { + final PerDocValues f = r.perDocValues(); + if (f != null) { + producer.add(f); + slices + .add(new ReaderUtil.Slice(base, r.maxDoc(), producer.size() - 1)); + } + } + }.run(); + + if (producer.size() == 0) { + return null; + } else if (producer.size() == 1) { + perDocValues = producer.get(0); + } else { + perDocValues = new MultiPerDocValues( + producer.toArray(PerDocValues.EMPTY_ARRAY), + slices.toArray(ReaderUtil.Slice.EMPTY_ARRAY)); + } + r.storePerDoc(perDocValues); + } + return perDocValues; + } + + public IndexDocValues docValues(String field) throws IOException { + IndexDocValues result = docValues.get(field); + if (result == null) { + // Lazy init: first time this field is requested, we + // create & add to docValues: + final List docValuesIndex = new ArrayList(); + int docsUpto = 0; + ValueType type = null; + // Gather all sub-readers that share this field + for (int i = 0; i < subs.length; i++) { + IndexDocValues values = subs[i].docValues(field); + final int start = subSlices[i].start; + final int length = subSlices[i].length; + if (values != null) { + if (docsUpto != start) { + type = values.type(); + docValuesIndex.add(new MultiIndexDocValues.DocValuesIndex( + new MultiIndexDocValues.DummyDocValues(start, type), docsUpto, start + - docsUpto)); + } + docValuesIndex.add(new MultiIndexDocValues.DocValuesIndex(values, start, + length)); + docsUpto = start + length; + + } else if (i + 1 == subs.length && !docValuesIndex.isEmpty()) { + docValuesIndex.add(new MultiIndexDocValues.DocValuesIndex( + new MultiIndexDocValues.DummyDocValues(start, type), docsUpto, start + - docsUpto)); + } + } + if (docValuesIndex.isEmpty()) { + return null; + } + result = new MultiIndexDocValues( + docValuesIndex.toArray(DocValuesIndex.EMPTY_ARRAY)); + docValues.put(field, result); + } + return result; + } + + public void close() throws IOException { + final PerDocValues[] perDocValues = this.subs; + IOException ex = null; + for (PerDocValues values : perDocValues) { + try { + values.close(); + } catch (IOException e) { + if (ex == null) { + ex = e; + } + } + } + if (ex != null) { + throw ex; + } + } + + @Override + public Collection fields() { + return fields; + } +} diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/MultiReader.java docvalues/lucene/src/java/org/apache/lucene/index/MultiReader.java --- trunk_2/lucene/src/java/org/apache/lucene/index/MultiReader.java 2011-02-01 17:08:42.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/MultiReader.java 2011-06-03 22:42:59.000000000 +0200 @@ -24,6 +24,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; +import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.util.Bits; import 
org.apache.lucene.util.BytesRef; import org.apache.lucene.util.ReaderUtil; @@ -403,4 +404,9 @@ sub.removeReaderFinishedListener(listener); } } + + @Override + public PerDocValues perDocValues() throws IOException { + throw new UnsupportedOperationException("please use MultiPerDocValues#getPerDocs, or wrap your IndexReader with SlowMultiReaderWrapper, if you really need a top level Fields"); + } } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/ParallelReader.java docvalues/lucene/src/java/org/apache/lucene/index/ParallelReader.java --- trunk_2/lucene/src/java/org/apache/lucene/index/ParallelReader.java 2011-02-01 17:08:45.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/ParallelReader.java 2011-05-17 16:46:39.000000000 +0200 @@ -21,6 +21,7 @@ import org.apache.lucene.document.FieldSelector; import org.apache.lucene.document.FieldSelectorResult; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.MapBackedSet; @@ -180,6 +181,7 @@ return TermsEnum.EMPTY; } } + } // Single instance of this, per ParallelReader instance @@ -187,7 +189,8 @@ final HashMap fields = new HashMap(); public void addField(String field, IndexReader r) throws IOException { - fields.put(field, MultiFields.getFields(r).terms(field)); + Fields multiFields = MultiFields.getFields(r); + fields.put(field, multiFields.terms(field)); } @Override @@ -199,8 +202,8 @@ return fields.get(field); } } - - @Override + + @Override public Bits getDeletedDocs() { return MultiFields.getDeletedDocs(readers.get(0)); } @@ -563,6 +566,12 @@ reader.removeReaderFinishedListener(listener); } } + + @Override + public PerDocValues perDocValues() throws IOException { + // TODO Auto-generated method stub + return null; + } } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/PerDocWriteState.java docvalues/lucene/src/java/org/apache/lucene/index/PerDocWriteState.java --- trunk_2/lucene/src/java/org/apache/lucene/index/PerDocWriteState.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/PerDocWriteState.java 2011-05-17 16:46:40.000000000 +0200 @@ -0,0 +1,74 @@ +package org.apache.lucene.index; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.PrintStream; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.lucene.index.codecs.PerDocConsumer; +import org.apache.lucene.store.Directory; + +/** + * Encapsulates all necessary state to initiate a {@link PerDocConsumer} and + * create all necessary files in order to consume and merge per-document values. 
+ * + * @lucene.experimental + */ +public class PerDocWriteState { + public final PrintStream infoStream; + public final Directory directory; + public final String segmentName; + public final FieldInfos fieldInfos; + public final AtomicLong bytesUsed; + public final SegmentCodecs segmentCodecs; + public final int codecId; + + PerDocWriteState(PrintStream infoStream, Directory directory, + String segmentName, FieldInfos fieldInfos, AtomicLong bytesUsed, + int codecId) { + this.infoStream = infoStream; + this.directory = directory; + this.segmentName = segmentName; + this.fieldInfos = fieldInfos; + this.segmentCodecs = fieldInfos.buildSegmentCodecs(false); + this.codecId = codecId; + this.bytesUsed = bytesUsed; + } + + PerDocWriteState(SegmentWriteState state) { + infoStream = state.infoStream; + directory = state.directory; + segmentCodecs = state.segmentCodecs; + segmentName = state.segmentName; + fieldInfos = state.fieldInfos; + codecId = state.codecId; + bytesUsed = new AtomicLong(0); + } + + PerDocWriteState(PerDocWriteState state, int codecId) { + this.infoStream = state.infoStream; + this.directory = state.directory; + this.segmentName = state.segmentName; + this.fieldInfos = state.fieldInfos; + this.segmentCodecs = state.segmentCodecs; + this.codecId = codecId; + this.bytesUsed = state.bytesUsed; + } + + public String codecIdAsString() { + return "" + codecId; + } +} diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/PerFieldCodecWrapper.java docvalues/lucene/src/java/org/apache/lucene/index/PerFieldCodecWrapper.java --- trunk_2/lucene/src/java/org/apache/lucene/index/PerFieldCodecWrapper.java 2011-05-29 22:19:08.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/PerFieldCodecWrapper.java 2011-06-03 22:42:59.000000000 +0200 @@ -19,16 +19,22 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Set; +import java.util.TreeMap; import java.util.TreeSet; import org.apache.lucene.index.codecs.Codec; import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.FieldsProducer; +import org.apache.lucene.index.codecs.PerDocConsumer; +import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.index.codecs.TermsConsumer; +import org.apache.lucene.index.codecs.DocValuesConsumer; +import org.apache.lucene.index.values.IndexDocValues; import org.apache.lucene.store.Directory; import org.apache.lucene.util.IOUtils; @@ -64,7 +70,7 @@ for (int i = 0; i < codecs.length; i++) { boolean success = false; try { - consumers.add(codecs[i].fieldsConsumer(new SegmentWriteState(state, "" + i))); + consumers.add(codecs[i].fieldsConsumer(new SegmentWriteState(state, i))); success = true; } finally { if (!success) { @@ -99,13 +105,13 @@ boolean success = false; try { for (FieldInfo fi : fieldInfos) { - if (fi.isIndexed) { // TODO this does not work for non-indexed fields + if (fi.isIndexed) { fields.add(fi.name); assert fi.getCodecId() != FieldInfo.UNASSIGNED_CODEC_ID; Codec codec = segmentCodecs.codecs[fi.getCodecId()]; if (!producers.containsKey(codec)) { producers.put(codec, codec.fieldsProducer(new SegmentReadState(dir, - si, fieldInfos, readBufferSize, indexDivisor, ""+fi.getCodecId()))); + si, fieldInfos, readBufferSize, indexDivisor, fi.getCodecId()))); } codecs.put(fi.name, producers.get(codec)); } @@ -120,6 +126,7 @@ } } } + private final class FieldsIterator extends FieldsEnum { private 
final Iterator it; @@ -161,7 +168,7 @@ FieldsProducer fields = codecs.get(field); return fields == null ? null : fields.terms(field); } - + @Override public void close() throws IOException { IOUtils.closeSafely(false, codecs.values()); @@ -184,7 +191,7 @@ } @Override - public void files(Directory dir, SegmentInfo info, String codecId, Set files) + public void files(Directory dir, SegmentInfo info, int codecId, Set files) throws IOException { // ignore codecid since segmentCodec will assign it per codec segmentCodecs.files(dir, info, files); @@ -196,4 +203,135 @@ codec.getExtensions(extensions); } } + + @Override + public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { + return new PerDocConsumers(state); + } + + @Override + public PerDocValues docsProducer(SegmentReadState state) throws IOException { + return new PerDocProducers(state.dir, state.fieldInfos, state.segmentInfo, + state.readBufferSize, state.termsIndexDivisor); + } + + private final class PerDocProducers extends PerDocValues { + private final TreeMap codecs = new TreeMap(); + + public PerDocProducers(Directory dir, FieldInfos fieldInfos, SegmentInfo si, + int readBufferSize, int indexDivisor) throws IOException { + final Map producers = new HashMap(); + boolean success = false; + try { + for (FieldInfo fi : fieldInfos) { + if (fi.hasDocValues()) { + assert fi.getCodecId() != FieldInfo.UNASSIGNED_CODEC_ID; + Codec codec = segmentCodecs.codecs[fi.getCodecId()]; + if (!producers.containsKey(codec)) { + producers.put(codec, codec.docsProducer(new SegmentReadState(dir, + si, fieldInfos, readBufferSize, indexDivisor, fi.getCodecId()))); + } + codecs.put(fi.name, producers.get(codec)); + } + } + success = true; + } finally { + if (!success) { + // If we hit exception (eg, IOE because writer was + // committing, or, for any other reason) we must + // go back and close all FieldsProducers we opened: + for(PerDocValues producer : producers.values()) { + try { + producer.close(); + } catch (Throwable t) { + // Suppress all exceptions here so we continue + // to throw the original one + } + } + } + } + } + + @Override + public Collection fields() { + return codecs.keySet(); + } + @Override + public IndexDocValues docValues(String field) throws IOException { + final PerDocValues perDocProducer = codecs.get(field); + if (perDocProducer == null) { + return null; + } + return perDocProducer.docValues(field); + } + + public void close() throws IOException { + final Collection values = codecs.values(); + IOException err = null; + for (PerDocValues perDocValues : values) { + try { + if (perDocValues != null) { + perDocValues.close(); + } + } catch (IOException ioe) { + // keep first IOException we hit but keep + // closing the rest + if (err == null) { + err = ioe; + } + } + } + if (err != null) { + throw err; + } + } + } + + private final class PerDocConsumers extends PerDocConsumer { + private final PerDocConsumer[] consumers; + private final Codec[] codecs; + private final PerDocWriteState state; + + public PerDocConsumers(PerDocWriteState state) throws IOException { + assert segmentCodecs == state.segmentCodecs; + this.state = state; + codecs = segmentCodecs.codecs; + consumers = new PerDocConsumer[codecs.length]; + } + + public void close() throws IOException { + IOException err = null; + for (int i = 0; i < consumers.length; i++) { + try { + final PerDocConsumer next = consumers[i]; + if (next != null) { + next.close(); + } + } catch (IOException ioe) { + // keep first IOException we hit but keep + // closing the 
rest + if (err == null) { + err = ioe; + } + } + } + if (err != null) { + throw err; + } + } + + @Override + public DocValuesConsumer addValuesField(FieldInfo field) throws IOException { + final int codecId = field.getCodecId(); + assert codecId != FieldInfo.UNASSIGNED_CODEC_ID; + PerDocConsumer perDoc = consumers[codecId]; + if (perDoc == null) { + perDoc = codecs[codecId].docsConsumer(new PerDocWriteState(state, codecId)); + assert perDoc != null; + consumers[codecId] = perDoc; + } + return perDoc.addValuesField(field); + } + + } } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/SegmentCodecs.java docvalues/lucene/src/java/org/apache/lucene/index/SegmentCodecs.java --- trunk_2/lucene/src/java/org/apache/lucene/index/SegmentCodecs.java 2011-03-27 09:34:45.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/SegmentCodecs.java 2011-05-17 16:46:44.000000000 +0200 @@ -102,7 +102,7 @@ throws IOException { final Codec[] codecArray = codecs; for (int i = 0; i < codecArray.length; i++) { - codecArray[i].files(dir, info, ""+i, files); + codecArray[i].files(dir, info, i, files); } } @@ -148,13 +148,6 @@ return this; } - SegmentCodecsBuilder addAll(FieldInfos infos) { - for (FieldInfo fieldInfo : infos) { - tryAddAndSet(fieldInfo); - } - return this; - } - SegmentCodecs build() { return new SegmentCodecs(provider, codecs.toArray(Codec.EMPTY)); } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/SegmentCoreReaders.java docvalues/lucene/src/java/org/apache/lucene/index/SegmentCoreReaders.java --- trunk_2/lucene/src/java/org/apache/lucene/index/SegmentCoreReaders.java 2011-05-28 09:05:02.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/SegmentCoreReaders.java 2011-06-03 22:42:58.000000000 +0200 @@ -20,7 +20,9 @@ import java.io.IOException; import java.util.concurrent.atomic.AtomicInteger; +import org.apache.lucene.index.codecs.Codec; import org.apache.lucene.index.codecs.FieldsProducer; +import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.store.Directory; /** Holds core readers that are shared (unchanged) when @@ -39,7 +41,8 @@ final FieldInfos fieldInfos; final FieldsProducer fields; - + final PerDocValues perDocProducer; + final Directory dir; final Directory cfsDir; final int readBufferSize; @@ -51,6 +54,8 @@ TermVectorsReader termVectorsReaderOrig; CompoundFileReader cfsReader; CompoundFileReader storeCFSReader; + + SegmentCoreReaders(SegmentReader owner, Directory dir, SegmentInfo si, int readBufferSize, int termsIndexDivisor) throws IOException { @@ -76,11 +81,12 @@ fieldInfos = si.getFieldInfos(); this.termsIndexDivisor = termsIndexDivisor; - + final Codec codec = segmentCodecs.codec(); + final SegmentReadState segmentReadState = new SegmentReadState(cfsDir, si, fieldInfos, readBufferSize, termsIndexDivisor); // Ask codec for its Fields - fields = segmentCodecs.codec().fieldsProducer(new SegmentReadState(cfsDir, si, fieldInfos, readBufferSize, termsIndexDivisor)); + fields = codec.fieldsProducer(segmentReadState); assert fields != null; - + perDocProducer = codec.docsProducer(segmentReadState); success = true; } finally { if (!success) { @@ -119,6 +125,10 @@ fields.close(); } + if (perDocProducer != null) { + perDocProducer.close(); + } + if (termVectorsReaderOrig != null) { termVectorsReaderOrig.close(); } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/SegmentInfo.java docvalues/lucene/src/java/org/apache/lucene/index/SegmentInfo.java --- 
trunk_2/lucene/src/java/org/apache/lucene/index/SegmentInfo.java 2011-05-28 09:05:02.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/SegmentInfo.java 2011-06-03 22:42:58.000000000 +0200 @@ -17,6 +17,7 @@ * limitations under the License. */ + import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; @@ -25,6 +26,7 @@ import java.util.Map; import java.util.Map.Entry; import java.util.Set; +import java.util.regex.Pattern; import org.apache.lucene.index.codecs.Codec; import org.apache.lucene.index.codecs.CodecProvider; @@ -652,7 +654,7 @@ if (delFileName != null && (delGen >= YES || dir.fileExists(delFileName))) { fileSet.add(delFileName); } - + if (normGen != null) { for (Entry entry : normGen.entrySet()) { long gen = entry.getValue(); diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/SegmentMerger.java docvalues/lucene/src/java/org/apache/lucene/index/SegmentMerger.java --- trunk_2/lucene/src/java/org/apache/lucene/index/SegmentMerger.java 2011-05-29 22:19:08.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/SegmentMerger.java 2011-06-03 22:42:59.000000000 +0200 @@ -29,6 +29,8 @@ import org.apache.lucene.index.codecs.Codec; import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.MergeState; +import org.apache.lucene.index.codecs.PerDocConsumer; +import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; @@ -116,7 +118,6 @@ if (fieldInfos.hasVectors()) mergeVectors(); - return mergedDocs; } @@ -154,7 +155,7 @@ for (String field : names) { fInfos.addOrUpdate(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader - .hasNorms(field), storePayloads, omitTFAndPositions); + .hasNorms(field), storePayloads, omitTFAndPositions, null); } } @@ -222,6 +223,7 @@ addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.STORES_PAYLOADS), false, false, false, true, false); addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.INDEXED), false, false, false, false, false); fieldInfos.addOrUpdate(reader.getFieldNames(FieldOption.UNINDEXED), false); + fieldInfos.addOrUpdate(reader.getFieldNames(FieldOption.DOC_VALUES), false); } } final SegmentCodecs codecInfo = fieldInfos.buildSegmentCodecs(false); @@ -477,9 +479,16 @@ int docBase = 0; final List fields = new ArrayList(); + final List slices = new ArrayList(); final List bits = new ArrayList(); final List bitsStarts = new ArrayList(); + + // TODO: move this into its own method - this merges currently only docvalues + final List perDocProducers = new ArrayList(); + final List perDocSlices = new ArrayList(); + final List perDocBits = new ArrayList(); + final List perDocBitsStarts = new ArrayList(); for(IndexReader r : readers) { final Fields f = r.fields(); @@ -490,10 +499,18 @@ bits.add(r.getDeletedDocs()); bitsStarts.add(docBase); } + final PerDocValues producer = r.perDocValues(); + if (producer != null) { + perDocSlices.add(new ReaderUtil.Slice(docBase, maxDoc, fields.size())); + perDocProducers.add(producer); + perDocBits.add(r.getDeletedDocs()); + perDocBitsStarts.add(docBase); + } docBase += maxDoc; } bitsStarts.add(docBase); + perDocBitsStarts.add(docBase); // we may gather more readers than mergeState.readerCount mergeState = new MergeState(); @@ -559,6 +576,20 @@ } finally { consumer.close(); } + if (!perDocSlices.isEmpty()) { + 
mergeState.multiDeletedDocs = new MultiBits(perDocBits, perDocBitsStarts); + final PerDocConsumer docsConsumer = codec + .docsConsumer(new PerDocWriteState(segmentWriteState)); + try { + final MultiPerDocValues multiPerDocValues = new MultiPerDocValues(perDocProducers + .toArray(PerDocValues.EMPTY_ARRAY), perDocSlices + .toArray(ReaderUtil.Slice.EMPTY_ARRAY)); + docsConsumer.merge(mergeState, multiPerDocValues); + } finally { + docsConsumer.close(); + } + } + } private MergeState mergeState; diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/SegmentReadState.java docvalues/lucene/src/java/org/apache/lucene/index/SegmentReadState.java --- trunk_2/lucene/src/java/org/apache/lucene/index/SegmentReadState.java 2011-01-06 07:03:09.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/SegmentReadState.java 2011-05-17 16:46:41.000000000 +0200 @@ -34,11 +34,11 @@ // that must do so), then it should negate this value to // get the app's terms divisor: public int termsIndexDivisor; - public final String codecId; + public final int codecId; public SegmentReadState(Directory dir, SegmentInfo info, FieldInfos fieldInfos, int readBufferSize, int termsIndexDivisor) { - this(dir, info, fieldInfos, readBufferSize, termsIndexDivisor, ""); + this(dir, info, fieldInfos, readBufferSize, termsIndexDivisor, -1); } public SegmentReadState(Directory dir, @@ -46,7 +46,7 @@ FieldInfos fieldInfos, int readBufferSize, int termsIndexDivisor, - String codecId) { + int codecId) { this.dir = dir; this.segmentInfo = info; this.fieldInfos = fieldInfos; @@ -54,4 +54,8 @@ this.termsIndexDivisor = termsIndexDivisor; this.codecId = codecId; } + + public String codecIdAsString() { + return "" + codecId; + } } \ No newline at end of file diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/SegmentReader.java docvalues/lucene/src/java/org/apache/lucene/index/SegmentReader.java --- trunk_2/lucene/src/java/org/apache/lucene/index/SegmentReader.java 2011-06-04 00:09:03.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/SegmentReader.java 2011-06-03 22:42:58.000000000 +0200 @@ -29,6 +29,8 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; +import org.apache.lucene.index.codecs.PerDocValues; +import org.apache.lucene.index.values.IndexDocValues; import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; @@ -839,4 +841,15 @@ // longer used (all SegmentReaders sharing it have been // closed). } + + + @Override + public IndexDocValues docValues(String field) throws IOException { + return core.perDocProducer.docValues(field); + } + + @Override + public PerDocValues perDocValues() throws IOException { + return core.perDocProducer; + } } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java docvalues/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java --- trunk_2/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java 2011-05-28 09:05:02.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java 2011-05-17 16:46:44.000000000 +0200 @@ -43,7 +43,7 @@ public BitVector deletedDocs; final SegmentCodecs segmentCodecs; - public final String codecId; + public final int codecId; /** Expert: The fraction of terms in the "dictionary" which should be stored * in RAM. 
Smaller values use more memory, but make searching slightly @@ -53,7 +53,7 @@ public int termIndexInterval; // TODO: this should be private to the codec, not settable here or in IWC public SegmentWriteState(PrintStream infoStream, Directory directory, String segmentName, FieldInfos fieldInfos, - int numDocs, int termIndexInterval, SegmentCodecs segmentCodecs, BufferedDeletes segDeletes) { + int numDocs, int termIndexInterval, SegmentCodecs segmentCodecs, BufferedDeletes segDeletes) { this.infoStream = infoStream; this.segDeletes = segDeletes; this.directory = directory; @@ -62,13 +62,13 @@ this.numDocs = numDocs; this.termIndexInterval = termIndexInterval; this.segmentCodecs = segmentCodecs; - codecId = ""; + codecId = -1; } /** * Create a shallow {@link SegmentWriteState} copy final a codec ID */ - SegmentWriteState(SegmentWriteState state, String codecId) { + SegmentWriteState(SegmentWriteState state, int codecId) { infoStream = state.infoStream; directory = state.directory; segmentName = state.segmentName; @@ -79,4 +79,8 @@ this.codecId = codecId; segDeletes = state.segDeletes; } + + public String codecIdAsString() { + return "" + codecId; + } } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/SlowMultiReaderWrapper.java docvalues/lucene/src/java/org/apache/lucene/index/SlowMultiReaderWrapper.java --- trunk_2/lucene/src/java/org/apache/lucene/index/SlowMultiReaderWrapper.java 2011-01-13 07:36:22.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/SlowMultiReaderWrapper.java 2011-05-17 16:46:44.000000000 +0200 @@ -26,6 +26,7 @@ import org.apache.lucene.index.DirectoryReader; // javadoc import org.apache.lucene.index.MultiReader; // javadoc +import org.apache.lucene.index.codecs.PerDocValues; /** * This class forces a composite reader (eg a {@link @@ -65,6 +66,11 @@ } @Override + public PerDocValues perDocValues() throws IOException { + return MultiPerDocValues.getPerDocs(in); + } + + @Override public Bits getDeletedDocs() { return MultiFields.getDeletedDocs(in); } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java 2011-05-28 09:05:01.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java 2011-06-03 22:42:58.000000000 +0200 @@ -35,6 +35,7 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.codecs.standard.StandardPostingsReader; // javadocs +import org.apache.lucene.index.values.IndexDocValues; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; @@ -108,14 +109,14 @@ //private String segment; public BlockTermsReader(TermsIndexReaderBase indexReader, Directory dir, FieldInfos fieldInfos, String segment, PostingsReaderBase postingsReader, int readBufferSize, - int termsCacheSize, String codecId) + int termsCacheSize, int codecId) throws IOException { this.postingsReader = postingsReader; termsCache = new DoubleBarrelLRUCache(termsCacheSize); //this.segment = segment; - in = dir.openInput(IndexFileNames.segmentFileName(segment, codecId, BlockTermsWriter.TERMS_EXTENSION), + in = dir.openInput(IndexFileNames.segmentFileName(segment, ""+codecId, BlockTermsWriter.TERMS_EXTENSION), readBufferSize); boolean success = false; diff -ruN -x .svn -x build 
trunk_2/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java 2011-05-29 22:19:08.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java 2011-06-03 22:42:58.000000000 +0200 @@ -70,7 +70,7 @@ public BlockTermsWriter(TermsIndexWriterBase termsIndexWriter, SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException { - final String termsFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, TERMS_EXTENSION); + final String termsFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecIdAsString(), TERMS_EXTENSION); this.termsIndexWriter = termsIndexWriter; out = state.directory.createOutput(termsFileName); boolean success = false; diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/Codec.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/Codec.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/Codec.java 2010-11-15 20:34:12.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/Codec.java 2011-05-17 16:46:43.000000000 +0200 @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.Set; +import org.apache.lucene.index.PerDocWriteState; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentReadState; @@ -50,6 +51,10 @@ * returns, it must hold open any files it will need to * use; else, those files may be deleted. */ public abstract FieldsProducer fieldsProducer(SegmentReadState state) throws IOException; + + public abstract PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException; + + public abstract PerDocValues docsProducer(SegmentReadState state) throws IOException; /** * Gathers files associated with this segment @@ -59,7 +64,7 @@ * @param id the codec id within this segment * @param files the of files to add the codec files to. */ - public abstract void files(Directory dir, SegmentInfo segmentInfo, String id, Set files) throws IOException; + public abstract void files(Directory dir, SegmentInfo segmentInfo, int id, Set files) throws IOException; /** Records all file extensions this codec uses */ public abstract void getExtensions(Set extensions); diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java 2011-05-28 09:05:01.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java 2011-06-03 22:42:58.000000000 +0200 @@ -22,6 +22,7 @@ import java.util.HashSet; import java.util.Map; import java.util.Set; +import java.util.Map.Entry; /** Holds a set of codecs, keyed by name. You subclass * this, instantiate it, and register your codecs, then @@ -180,4 +181,24 @@ public synchronized void setDefaultFieldCodec(String codec) { defaultFieldCodec = codec; } + + /** + * Registers all codecs from the given provider including the field to codec + * mapping and the default field codec. + *

+ * NOTE: This method will pass any codec from the given codec provider to + * {@link #register(Codec)} and set field codecs via + * {@link #setFieldCodec(String, String)}. + */ + public void copyFrom(CodecProvider other) { + final Collection values = other.codecs.values(); + for (Codec codec : values) { + register(codec); + } + final Set> entrySet = other.perFieldMap.entrySet(); + for (Entry entry : entrySet) { + setFieldCodec(entry.getKey(), entry.getValue()); + } + setDefaultFieldCodec(other.getDefaultFieldCodec()); + } } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesConsumer.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesConsumer.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesConsumer.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesConsumer.java 2011-05-23 13:19:51.000000000 +0200 @@ -0,0 +1,103 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Comparator; +import java.util.Set; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.PerDocWriteState; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.values.Writer; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; + +public class DefaultDocValuesConsumer extends PerDocConsumer { + private final String segmentName; + private final int codecId; + private final Directory directory; + private final AtomicLong bytesUsed; + private final Comparator comparator; + + public DefaultDocValuesConsumer(PerDocWriteState state, Comparator comparator) { + this.segmentName = state.segmentName; + this.codecId = state.codecId; + this.bytesUsed = state.bytesUsed; + this.directory = state.directory; + this.comparator = comparator; + } + + public void close() throws IOException { + } + + @Override + public DocValuesConsumer addValuesField(FieldInfo field) throws IOException { + return Writer.create(field.getDocValues(), + docValuesId(segmentName, codecId, field.number), + // TODO can we have a compound file per segment and codec for + // docvalues?
+ directory, comparator, bytesUsed); + } + + public static void files(Directory dir, SegmentInfo segmentInfo, int codecId, + Set files) throws IOException { + FieldInfos fieldInfos = segmentInfo.getFieldInfos(); + for (FieldInfo fieldInfo : fieldInfos) { + if (fieldInfo.getCodecId() == codecId && fieldInfo.hasDocValues()) { + String filename = docValuesId(segmentInfo.name, codecId, + fieldInfo.number); + switch (fieldInfo.getDocValues()) { + case BYTES_FIXED_DEREF: + case BYTES_VAR_DEREF: + case BYTES_VAR_SORTED: + case BYTES_FIXED_SORTED: + case BYTES_VAR_STRAIGHT: + files.add(IndexFileNames.segmentFileName(filename, "", + Writer.INDEX_EXTENSION)); + assert dir.fileExists(IndexFileNames.segmentFileName(filename, "", + Writer.INDEX_EXTENSION)); + // until here all types use an index + case BYTES_FIXED_STRAIGHT: + case FLOAT_32: + case FLOAT_64: + case INTS: + files.add(IndexFileNames.segmentFileName(filename, "", + Writer.DATA_EXTENSION)); + assert dir.fileExists(IndexFileNames.segmentFileName(filename, "", + Writer.DATA_EXTENSION)); + break; + default: + assert false; + } + } + } + } + + static String docValuesId(String segmentsName, int codecID, int fieldId) { + return segmentsName + "_" + codecID + "-" + fieldId; + } + + public static void getDocValuesExtensions(Set extensions) { + extensions.add(Writer.DATA_EXTENSION); + extensions.add(Writer.INDEX_EXTENSION); + } +} diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesProducer.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesProducer.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesProducer.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesProducer.java 2011-06-03 22:42:58.000000000 +0200 @@ -0,0 +1,171 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; +import java.util.Collection; +import java.util.TreeMap; + +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.codecs.PerDocValues; +import org.apache.lucene.index.values.Bytes; +import org.apache.lucene.index.values.IndexDocValues; +import org.apache.lucene.index.values.Floats; +import org.apache.lucene.index.values.Ints; +import org.apache.lucene.index.values.ValueType; +import org.apache.lucene.store.Directory; + +/** + * Abstract base class for FieldsProducer implementations supporting + * {@link IndexDocValues}. 
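// Editorial sketch, not part of the patch: the per-field file id scheme used by
// DefaultDocValuesConsumer above. Segment name, codec id and field number are joined into
// one id; the data file (and, for the deref/sorted/var-straight byte types, an additional
// index file) is then named via IndexFileNames.segmentFileName(id, "", Writer.DATA_EXTENSION
// / Writer.INDEX_EXTENSION). The concrete values below are made up for illustration.
public class DocValuesIdExample {
  // Mirrors DefaultDocValuesConsumer.docValuesId(segmentsName, codecID, fieldId).
  static String docValuesId(String segmentsName, int codecId, int fieldId) {
    return segmentsName + "_" + codecId + "-" + fieldId;
  }

  public static void main(String[] args) {
    // Segment "_0", codec id 0, field number 3 -> "_0_0-3"
    System.out.println(docValuesId("_0", 0, 3));
  }
}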
+ * + * @lucene.experimental + */ +public class DefaultDocValuesProducer extends PerDocValues { + + protected final TreeMap docValues; + + /** + * Creates a new {@link DefaultDocValuesProducer} instance and loads all + * {@link IndexDocValues} instances for this segment and codec. + * + * @param si + * the segment info to load the {@link IndexDocValues} for. + * @param dir + * the directory to load the {@link IndexDocValues} from. + * @param fieldInfo + * the {@link FieldInfos} + * @param codecId + * the codec ID + * @throws IOException + * if an {@link IOException} occurs + */ + public DefaultDocValuesProducer(SegmentInfo si, Directory dir, + FieldInfos fieldInfo, int codecId) throws IOException { + docValues = load(fieldInfo, si.name, si.docCount, dir, codecId); + } + + /** + * Returns a {@link IndexDocValues} instance for the given field name or + * null if this field has no {@link IndexDocValues}. + */ + @Override + public IndexDocValues docValues(String field) throws IOException { + return docValues.get(field); + } + + // Only opens files... doesn't actually load any values + protected TreeMap load(FieldInfos fieldInfos, + String segment, int docCount, Directory dir, int codecId) + throws IOException { + TreeMap values = new TreeMap(); + boolean success = false; + try { + + for (FieldInfo fieldInfo : fieldInfos) { + if (codecId == fieldInfo.getCodecId() && fieldInfo.hasDocValues()) { + final String field = fieldInfo.name; + // TODO can we have a compound file per segment and codec for + // docvalues? + final String id = DefaultDocValuesConsumer.docValuesId(segment, + codecId, fieldInfo.number); + values.put(field, + loadDocValues(docCount, dir, id, fieldInfo.getDocValues())); + } + } + success = true; + } finally { + if (!success) { + // if we fail we must close all opened resources if there are any + closeDocValues(values.values()); + } + } + return values; + } + + + /** + * Loads a {@link IndexDocValues} instance depending on the given {@link ValueType}. + * Codecs that use different implementations for a certain {@link ValueType} can + * simply override this method and return their custom implementations. 
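// Editorial sketch, not part of the patch: the extension point described in the javadoc
// above. A codec that needs a different IndexDocValues implementation for a particular
// ValueType can subclass DefaultDocValuesProducer and override loadDocValues; this stub only
// shows the hook and defers every type to the default implementation.
import java.io.IOException;

import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.codecs.DefaultDocValuesProducer;
import org.apache.lucene.index.values.IndexDocValues;
import org.apache.lucene.index.values.ValueType;
import org.apache.lucene.store.Directory;

public class CustomDocValuesProducer extends DefaultDocValuesProducer {

  public CustomDocValuesProducer(SegmentInfo si, Directory dir, FieldInfos fieldInfos, int codecId)
      throws IOException {
    super(si, dir, fieldInfos, codecId);
  }

  @Override
  protected IndexDocValues loadDocValues(int docCount, Directory dir, String id, ValueType type)
      throws IOException {
    // A real codec would branch on `type` here and return its own IndexDocValues reader for
    // the types it customizes; this sketch simply defers to the default implementation.
    return super.loadDocValues(docCount, dir, id, type);
  }
}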
+ * + * @param docCount + * number of documents in the segment + * @param dir + * the {@link Directory} to load the {@link IndexDocValues} from + * @param id + * the unique file ID within the segment + * @param type + * the type to load + * @return a {@link IndexDocValues} instance for the given type + * @throws IOException + * if an {@link IOException} occurs + * @throws IllegalArgumentException + * if the given {@link ValueType} is not supported + */ + protected IndexDocValues loadDocValues(int docCount, Directory dir, String id, + ValueType type) throws IOException { + switch (type) { + case INTS: + return Ints.getValues(dir, id, false); + case FLOAT_32: + return Floats.getValues(dir, id, docCount); + case FLOAT_64: + return Floats.getValues(dir, id, docCount); + case BYTES_FIXED_STRAIGHT: + return Bytes.getValues(dir, id, Bytes.Mode.STRAIGHT, true, docCount); + case BYTES_FIXED_DEREF: + return Bytes.getValues(dir, id, Bytes.Mode.DEREF, true, docCount); + case BYTES_FIXED_SORTED: + return Bytes.getValues(dir, id, Bytes.Mode.SORTED, true, docCount); + case BYTES_VAR_STRAIGHT: + return Bytes.getValues(dir, id, Bytes.Mode.STRAIGHT, false, docCount); + case BYTES_VAR_DEREF: + return Bytes.getValues(dir, id, Bytes.Mode.DEREF, false, docCount); + case BYTES_VAR_SORTED: + return Bytes.getValues(dir, id, Bytes.Mode.SORTED, false, docCount); + default: + throw new IllegalStateException("unrecognized index values mode " + type); + } + } + + public void close() throws IOException { + closeDocValues(docValues.values()); + } + + private void closeDocValues(final Collection values) + throws IOException { + IOException ex = null; + for (IndexDocValues docValues : values) { + try { + docValues.close(); + } catch (IOException e) { + ex = e; + } + } + if (ex != null) { + throw ex; + } + } + + @Override + public Collection fields() { + return docValues.keySet(); + } +} diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/DocValuesConsumer.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/DocValuesConsumer.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/DocValuesConsumer.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/DocValuesConsumer.java 2011-06-03 22:42:58.000000000 +0200 @@ -0,0 +1,167 @@ +package org.apache.lucene.index.codecs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import java.io.IOException; +import java.util.Collection; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.values.IndexDocValues; +import org.apache.lucene.index.values.PerDocFieldValues; +import org.apache.lucene.index.values.Writer; +import org.apache.lucene.util.Bits; + +/** + * Abstract API that consumes {@link PerDocFieldValues}. + * A {@link DocValuesConsumer} is always associated with a specific field and + * segment. Concrete implementations of this API write the given + * {@link PerDocFieldValues} into an implementation-specific format depending on + * the field's meta-data. + * + * @lucene.experimental + */ +public abstract class DocValuesConsumer { + // TODO this might need to go in the codec package since it is a direct relative + // to TermsConsumer + protected final AtomicLong bytesUsed; + + /** + * Creates a new {@link DocValuesConsumer}. + * + * @param bytesUsed + * bytes-usage tracking reference used by the implementation to track + * internally allocated memory. All tracked bytes must be released + * once {@link #finish(int)} has been called. + */ + protected DocValuesConsumer(AtomicLong bytesUsed) { + this.bytesUsed = bytesUsed == null ? new AtomicLong(0) : bytesUsed; + } + + /** + * Adds the given {@link PerDocFieldValues} instance to this + * {@link DocValuesConsumer} + * + * @param docID + * the document ID to add the value for. The docID must always + * increase or be 0 if it is the first call to this method. + * @param docValues + * the values to add + * @throws IOException + * if an {@link IOException} occurs + */ + public abstract void add(int docID, PerDocFieldValues docValues) + throws IOException; + + /** + * Called when the consumer of this API is done with adding + * {@link PerDocFieldValues} to this {@link DocValuesConsumer} + * + * @param docCount + * the total number of documents in this {@link DocValuesConsumer}. + * Must be greater than or equal to the last docID given to + * {@link #add(int, PerDocFieldValues)}. + * @throws IOException + */ + public abstract void finish(int docCount) throws IOException; + + /** + * Gathers files associated with this {@link DocValuesConsumer} + * + * @param files + * the collection of files to add the consumer's files to. + */ + public abstract void files(Collection files) throws IOException; + + /** + * Merges the given {@link org.apache.lucene.index.codecs.MergeState} into + * this {@link DocValuesConsumer}. + * + * @param mergeState + * the state to merge + * @param values + * the docValues to merge in + * @throws IOException + * if an {@link IOException} occurs + */ + public void merge(org.apache.lucene.index.codecs.MergeState mergeState, + IndexDocValues values) throws IOException { + assert mergeState != null; + // TODO we need some kind of compatibility notation for values such + // that two slightly different segments can be merged eg. fixed vs. + // variable byte len or float32 vs. float64 + int docBase = 0; + boolean merged = false; + /* + * We ignore the given DocValues here and merge from the subReaders directly + * to support bulk copies on the DocValues Writer level.
if this gets merged + * with MultiDocValues the writer can not optimize for bulk-copyable data + */ + for (final IndexReader reader : mergeState.readers) { + final IndexDocValues r = reader.docValues(mergeState.fieldInfo.name); + if (r != null) { + merged = true; + merge(new Writer.MergeState(r, docBase, reader.maxDoc(), reader + .getDeletedDocs())); + } + docBase += reader.numDocs(); + } + if (merged) { + finish(mergeState.mergedDocCount); + } + } + + /** + * Merges the given {@link MergeState} into this {@link DocValuesConsumer}. + * {@link MergeState#docBase} must always be increasing. Merging segments out + * of order is not supported. + * + * @param mergeState + * the {@link MergeState} to merge + * @throws IOException + * if an {@link IOException} occurs + */ + protected abstract void merge(MergeState mergeState) throws IOException; + + /** + * Specialized auxiliary MergeState is necessary since we don't want to + * exploit internals up to the codecs consumer. An instance of this class is + * created for each merged low level {@link IndexReader} we are merging to + * support low level bulk copies. + */ + public static class MergeState { + /** + * the source reader for this MergeState - merged values should be read from + * this instance + */ + public final IndexDocValues reader; + /** the absolute docBase for this MergeState within the resulting segment */ + public final int docBase; + /** the number of documents in this MergeState */ + public final int docCount; + /** the deleted bits for this MergeState */ + public final Bits bits; + + public MergeState(IndexDocValues reader, int docBase, int docCount, Bits bits) { + assert reader != null; + this.reader = reader; + this.docBase = docBase; + this.docCount = docCount; + this.bits = bits; + } + } +} diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java 2010-08-17 14:45:27.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java 2011-05-17 16:46:43.000000000 +0200 @@ -20,6 +20,7 @@ import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.Fields; import org.apache.lucene.index.FieldsEnum; +import org.apache.lucene.index.TermsEnum; import java.io.IOException; import java.io.Closeable; @@ -35,7 +36,7 @@ /** Add a new field */ public abstract TermsConsumer addField(FieldInfo field) throws IOException; - + /** Called when we are done adding everything. 
*/ public abstract void close() throws IOException; @@ -45,8 +46,13 @@ String field; while((field = fieldsEnum.next()) != null) { mergeState.fieldInfo = mergeState.fieldInfos.fieldInfo(field); - final TermsConsumer termsConsumer = addField(mergeState.fieldInfo); - termsConsumer.merge(mergeState, fieldsEnum.terms()); + assert mergeState.fieldInfo != null : "FieldInfo for field is null: "+ field; + TermsEnum terms = fieldsEnum.terms(); + if(terms != null) { + final TermsConsumer termsConsumer = addField(mergeState.fieldInfo); + termsConsumer.merge(mergeState, terms); + } } } + } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/FieldsProducer.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/FieldsProducer.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/FieldsProducer.java 2010-08-17 14:45:27.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/FieldsProducer.java 2011-05-17 16:46:43.000000000 +0200 @@ -17,10 +17,12 @@ * limitations under the License. */ -import org.apache.lucene.index.Fields; - -import java.io.IOException; import java.io.Closeable; +import java.io.IOException; + +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.FieldsEnum; +import org.apache.lucene.index.Terms; /** Abstract API that consumes terms, doc, freq, prox and * payloads postings. Concrete implementations of this @@ -33,4 +35,28 @@ public abstract class FieldsProducer extends Fields implements Closeable { public abstract void close() throws IOException; public abstract void loadTermsIndex(int indexDivisor) throws IOException; + + public static final FieldsProducer EMPTY = new FieldsProducer() { + + @Override + public Terms terms(String field) throws IOException { + return null; + } + + @Override + public FieldsEnum iterator() throws IOException { + return FieldsEnum.EMPTY; + } + + @Override + public void loadTermsIndex(int indexDivisor) throws IOException { + + } + + @Override + public void close() throws IOException { + + } + }; + } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java 2011-05-29 22:19:08.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java 2011-06-03 22:42:58.000000000 +0200 @@ -68,12 +68,12 @@ // start of the field info data protected long dirOffset; - public FixedGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor, Comparator termComp, String codecId) + public FixedGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor, Comparator termComp, int codecId) throws IOException { this.termComp = termComp; - in = dir.openInput(IndexFileNames.segmentFileName(segment, codecId, FixedGapTermsIndexWriter.TERMS_INDEX_EXTENSION)); + in = dir.openInput(IndexFileNames.segmentFileName(segment, ""+codecId, FixedGapTermsIndexWriter.TERMS_INDEX_EXTENSION)); boolean success = false; diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java 2011-05-29 22:19:08.000000000 +0200 +++ 
docvalues/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java 2011-06-03 22:42:58.000000000 +0200 @@ -56,7 +56,7 @@ private final FieldInfos fieldInfos; // unread public FixedGapTermsIndexWriter(SegmentWriteState state) throws IOException { - final String indexFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, TERMS_INDEX_EXTENSION); + final String indexFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecIdAsString(), TERMS_INDEX_EXTENSION); termIndexInterval = state.termIndexInterval; out = state.directory.createOutput(indexFileName); boolean success = false; diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/PerDocConsumer.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/PerDocConsumer.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/PerDocConsumer.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/PerDocConsumer.java 2011-06-03 22:42:58.000000000 +0200 @@ -0,0 +1,67 @@ +package org.apache.lucene.index.codecs; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +import java.io.Closeable; +import java.io.IOException; + +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.values.IndexDocValues; + +/** + * Abstract API that consumes per document values. Concrete implementations of + * this convert field values into a Codec specific format during indexing. + *

+ * The {@link PerDocConsumer} API is accessible through flexible indexing / the + * {@link Codec} - API providing per field consumers and producers for inverted + * data (terms, postings) as well as per-document data. + * + * @lucene.experimental + */ +public abstract class PerDocConsumer implements Closeable{ + /** Adds a new DocValuesField */ + public abstract DocValuesConsumer addValuesField(FieldInfo field) + throws IOException; + + /** + * Consumes and merges the given {@link PerDocValues} producer + * into this consumers format. + */ + public void merge(MergeState mergeState, PerDocValues producer) + throws IOException { + Iterable fields = producer.fields(); + for (String field : fields) { + mergeState.fieldInfo = mergeState.fieldInfos.fieldInfo(field); + assert mergeState.fieldInfo != null : "FieldInfo for field is null: " + + field; + if (mergeState.fieldInfo.hasDocValues()) { + final IndexDocValues docValues = producer.docValues(field); + if (docValues == null) { + /* + * It is actually possible that a fieldInfo has a values type but no + * values are actually available. this can happen if there are already + * segments without values around. + */ + continue; + } + final DocValuesConsumer docValuesConsumer = addValuesField(mergeState.fieldInfo); + assert docValuesConsumer != null; + docValuesConsumer.merge(mergeState, docValues); + } + } + + } +} diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/PerDocValues.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/PerDocValues.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/PerDocValues.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/PerDocValues.java 2011-06-03 22:42:58.000000000 +0200 @@ -0,0 +1,55 @@ +package org.apache.lucene.index.codecs; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.Closeable; +import java.io.IOException; +import java.util.Collection; + +import org.apache.lucene.index.values.IndexDocValues; + +/** + * Abstract API that provides access to one or more per-document storage + * features. The concrete implementations provide access to the underlying + * storage on a per-document basis corresponding to their actual + * {@link PerDocConsumer} counterpart. + *

+ * The {@link PerDocValues} API is accessible through flexible indexing / the + * {@link Codec} - API providing per field consumers and producers for inverted + * data (terms, postings) as well as per-document data. + * + * @lucene.experimental + */ +public abstract class PerDocValues implements Closeable { + /** + * Returns {@link IndexDocValues} for the current field. + * + * @param field + * the field name + * @return the {@link IndexDocValues} for this field or null if not + * applicable. + * @throws IOException + */ + public abstract IndexDocValues docValues(String field) throws IOException; + + public static final PerDocValues[] EMPTY_ARRAY = new PerDocValues[0]; + + /** + * Returns all fields this {@link PerDocValues} contains values for. + */ + public abstract Collection fields(); + +} diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java 2011-05-29 22:19:08.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java 2011-06-03 22:42:58.000000000 +0200 @@ -57,11 +57,10 @@ protected long dirOffset; final String segment; - - public VariableGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor, String codecId) + public VariableGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor, int codecId) throws IOException { - in = dir.openInput(IndexFileNames.segmentFileName(segment, codecId, VariableGapTermsIndexWriter.TERMS_INDEX_EXTENSION)); + in = dir.openInput(IndexFileNames.segmentFileName(segment, ""+codecId, VariableGapTermsIndexWriter.TERMS_INDEX_EXTENSION)); this.segment = segment; boolean success = false; @@ -159,15 +158,11 @@ private final class FieldIndexData { - private final FieldInfo fieldInfo; private final long indexStart; - // Set only if terms index is loaded: private volatile FST fst; public FieldIndexData(FieldInfo fieldInfo, long indexStart) throws IOException { - - this.fieldInfo = fieldInfo; this.indexStart = indexStart; if (indexDivisor > 0) { diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java 2011-05-29 22:19:08.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java 2011-06-03 22:42:58.000000000 +0200 @@ -158,7 +158,7 @@ // in the extremes. 
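// Editorial sketch, not part of the patch: using the PerDocValues API introduced above from
// the reader side. For an atomic reader (e.g. a SegmentReader), perDocValues() exposes the
// per-field values loaded by the codec, and fields() lists the field names it has values for.
import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.codecs.PerDocValues;
import org.apache.lucene.index.values.IndexDocValues;

public class ListDocValuesFields {
  static void dump(IndexReader atomicReader) throws IOException {
    final PerDocValues perDoc = atomicReader.perDocValues();
    if (perDoc == null) {
      return; // this reader has no per-document values
    }
    for (String field : perDoc.fields()) {
      final IndexDocValues values = perDoc.docValues(field);
      System.out.println(field + " -> " + (values == null ? "none" : values.type()));
    }
  }
}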
public VariableGapTermsIndexWriter(SegmentWriteState state, IndexTermSelector policy) throws IOException { - final String indexFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, TERMS_INDEX_EXTENSION); + final String indexFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecIdAsString(), TERMS_INDEX_EXTENSION); out = state.directory.createOutput(indexFileName); boolean success = false; try { diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java 2010-11-30 12:59:55.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java 2011-05-17 16:46:43.000000000 +0200 @@ -22,11 +22,14 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.PerDocWriteState; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.FieldsProducer; +import org.apache.lucene.index.codecs.PerDocConsumer; +import org.apache.lucene.index.codecs.PerDocValues; /** Codec that reads the pre-flex-indexing postings * format. It does not provide a writer because newly @@ -66,7 +69,7 @@ } @Override - public void files(Directory dir, SegmentInfo info, String id, Set files) throws IOException { + public void files(Directory dir, SegmentInfo info, int id, Set files) throws IOException { // preflex fields have no codec ID - we ignore it here PreFlexFields.files(dir, info, files); } @@ -78,4 +81,14 @@ extensions.add(TERMS_EXTENSION); extensions.add(TERMS_INDEX_EXTENSION); } + + @Override + public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { + throw new UnsupportedOperationException("PerDocConsumer is not supported by Preflex codec"); + } + + @Override + public PerDocValues docsProducer(SegmentReadState state) throws IOException { + throw new UnsupportedOperationException("PerDocValues is not supported by Preflex codec"); + } } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java 2011-05-29 22:19:08.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java 2011-06-03 22:42:58.000000000 +0200 @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.Set; +import org.apache.lucene.index.PerDocWriteState; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentReadState; @@ -28,8 +29,12 @@ import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; import org.apache.lucene.index.codecs.PostingsReaderBase; import org.apache.lucene.index.codecs.standard.StandardPostingsReader; +import org.apache.lucene.index.codecs.DefaultDocValuesProducer; import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.FieldsProducer; +import org.apache.lucene.index.codecs.PerDocConsumer; +import org.apache.lucene.index.codecs.DefaultDocValuesConsumer; +import org.apache.lucene.index.codecs.PerDocValues; 
import org.apache.lucene.index.codecs.VariableGapTermsIndexReader; import org.apache.lucene.index.codecs.VariableGapTermsIndexWriter; import org.apache.lucene.index.codecs.BlockTermsReader; @@ -38,6 +43,7 @@ import org.apache.lucene.index.codecs.TermsIndexWriterBase; import org.apache.lucene.index.codecs.standard.StandardCodec; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; /** This codec "inlines" the postings for terms that have @@ -147,14 +153,27 @@ } @Override - public void files(Directory dir, SegmentInfo segmentInfo, String id, Set files) throws IOException { - StandardPostingsReader.files(dir, segmentInfo, id, files); - BlockTermsReader.files(dir, segmentInfo, id, files); - VariableGapTermsIndexReader.files(dir, segmentInfo, id, files); + public void files(Directory dir, SegmentInfo segmentInfo, int id, Set files) throws IOException { + final String codecId = "" + id; + StandardPostingsReader.files(dir, segmentInfo, codecId, files); + BlockTermsReader.files(dir, segmentInfo, codecId, files); + VariableGapTermsIndexReader.files(dir, segmentInfo, codecId, files); + DefaultDocValuesConsumer.files(dir, segmentInfo, id, files); } @Override public void getExtensions(Set extensions) { StandardCodec.getStandardExtensions(extensions); + DefaultDocValuesConsumer.getDocValuesExtensions(extensions); + } + + @Override + public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { + return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); + } + + @Override + public PerDocValues docsProducer(SegmentReadState state) throws IOException { + return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId); } } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java 2011-05-28 09:05:01.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java 2011-05-17 16:46:43.000000000 +0200 @@ -58,20 +58,20 @@ int maxSkipLevels; int skipMinimum; - public SepPostingsReaderImpl(Directory dir, SegmentInfo segmentInfo, int readBufferSize, IntStreamFactory intFactory, String codecId) throws IOException { - + public SepPostingsReaderImpl(Directory dir, SegmentInfo segmentInfo, int readBufferSize, IntStreamFactory intFactory, int codecId) throws IOException { + final String codecIdAsString = "" + codecId; boolean success = false; try { - final String docFileName = IndexFileNames.segmentFileName(segmentInfo.name, codecId, SepPostingsWriterImpl.DOC_EXTENSION); + final String docFileName = IndexFileNames.segmentFileName(segmentInfo.name, codecIdAsString, SepPostingsWriterImpl.DOC_EXTENSION); docIn = intFactory.openInput(dir, docFileName); - skipIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, codecId, SepPostingsWriterImpl.SKIP_EXTENSION), readBufferSize); + skipIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, codecIdAsString, SepPostingsWriterImpl.SKIP_EXTENSION), readBufferSize); if (segmentInfo.getHasProx()) { - freqIn = intFactory.openInput(dir, IndexFileNames.segmentFileName(segmentInfo.name, codecId, SepPostingsWriterImpl.FREQ_EXTENSION)); - posIn = intFactory.openInput(dir, IndexFileNames.segmentFileName(segmentInfo.name, codecId, 
SepPostingsWriterImpl.POS_EXTENSION), readBufferSize); - payloadIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, codecId, SepPostingsWriterImpl.PAYLOAD_EXTENSION), readBufferSize); + freqIn = intFactory.openInput(dir, IndexFileNames.segmentFileName(segmentInfo.name, codecIdAsString, SepPostingsWriterImpl.FREQ_EXTENSION)); + posIn = intFactory.openInput(dir, IndexFileNames.segmentFileName(segmentInfo.name, codecIdAsString, SepPostingsWriterImpl.POS_EXTENSION), readBufferSize); + payloadIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, codecIdAsString, SepPostingsWriterImpl.PAYLOAD_EXTENSION), readBufferSize); } else { posIn = null; payloadIn = null; diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java 2011-05-29 22:19:08.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java 2011-06-03 22:42:58.000000000 +0200 @@ -117,25 +117,25 @@ try { this.skipInterval = skipInterval; this.skipMinimum = skipInterval; /* set to the same for now */ - final String docFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, DOC_EXTENSION); + final String docFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecIdAsString(), DOC_EXTENSION); docOut = factory.createOutput(state.directory, docFileName); docIndex = docOut.index(); if (state.fieldInfos.hasProx()) { - final String frqFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, FREQ_EXTENSION); + final String frqFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecIdAsString(), FREQ_EXTENSION); freqOut = factory.createOutput(state.directory, frqFileName); freqIndex = freqOut.index(); - final String posFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, POS_EXTENSION); + final String posFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecIdAsString(), POS_EXTENSION); posOut = factory.createOutput(state.directory, posFileName); posIndex = posOut.index(); // TODO: -- only if at least one field stores payloads? 
- final String payloadFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, PAYLOAD_EXTENSION); + final String payloadFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecIdAsString(), PAYLOAD_EXTENSION); payloadOut = state.directory.createOutput(payloadFileName); } - final String skipFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, SKIP_EXTENSION); + final String skipFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecIdAsString(), SKIP_EXTENSION); skipOut = state.directory.createOutput(skipFileName); totalNumDocs = state.numDocs; diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextCodec.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextCodec.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextCodec.java 2010-12-09 19:59:28.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextCodec.java 2011-06-04 00:22:11.000000000 +0200 @@ -20,14 +20,21 @@ import java.io.IOException; import java.util.Set; +import org.apache.lucene.index.PerDocWriteState; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.DocValuesConsumer; +import org.apache.lucene.index.codecs.DefaultDocValuesProducer; import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.FieldsProducer; +import org.apache.lucene.index.codecs.PerDocConsumer; +import org.apache.lucene.index.codecs.DefaultDocValuesConsumer; +import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; /** For debugging, curiosity, transparency only!! Do not * use this codec in production. 
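Every concrete codec touched by this patch gains the same two hooks: docsConsumer(PerDocWriteState) on the write side and docsProducer(SegmentReadState) on the read side. The additions to SimpleTextCodec below, and to StandardCodec and PulsingCodec elsewhere in the patch, all follow the shape sketched here for a hypothetical codec subclass (only the new hooks are shown; the postings-related methods are omitted):

    import java.io.IOException;

    import org.apache.lucene.index.PerDocWriteState;
    import org.apache.lucene.index.SegmentReadState;
    import org.apache.lucene.index.codecs.Codec;
    import org.apache.lucene.index.codecs.DefaultDocValuesConsumer;
    import org.apache.lucene.index.codecs.DefaultDocValuesProducer;
    import org.apache.lucene.index.codecs.PerDocConsumer;
    import org.apache.lucene.index.codecs.PerDocValues;
    import org.apache.lucene.util.BytesRef;

    abstract class MyDocValuesCodec extends Codec {
      @Override
      public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
        // write side: the default consumer dispatches to the per-field writers (Bytes, Floats, ...)
        return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator());
      }

      @Override
      public PerDocValues docsProducer(SegmentReadState state) throws IOException {
        // read side: opens the per-field values files written by the consumer
        return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId);
      }
    }

As the hunks in this patch show, a codec must also register the extra files and extensions through DefaultDocValuesConsumer.files(...) and DefaultDocValuesConsumer.getDocValuesExtensions(...).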
@@ -61,12 +68,25 @@ } @Override - public void files(Directory dir, SegmentInfo segmentInfo, String id, Set files) throws IOException { - files.add(getPostingsFileName(segmentInfo.name, id)); + public void files(Directory dir, SegmentInfo segmentInfo, int id, Set files) throws IOException { + files.add(getPostingsFileName(segmentInfo.name, ""+id)); + DefaultDocValuesConsumer.files(dir, segmentInfo, id, files); } @Override public void getExtensions(Set extensions) { extensions.add(POSTINGS_EXTENSION); + DefaultDocValuesConsumer.getDocValuesExtensions(extensions); + } + + // TODO: would be great if these used a plain text impl + @Override + public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { + return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); + } + + @Override + public PerDocValues docsProducer(SegmentReadState state) throws IOException { + return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId); } } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java 2011-05-29 22:19:08.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java 2011-06-03 22:42:58.000000000 +0200 @@ -45,7 +45,7 @@ final static BytesRef PAYLOAD = new BytesRef(" payload "); public SimpleTextFieldsWriter(SegmentWriteState state) throws IOException { - final String fileName = SimpleTextCodec.getPostingsFileName(state.segmentName, state.codecId); + final String fileName = SimpleTextCodec.getPostingsFileName(state.segmentName, state.codecIdAsString()); out = state.directory.createOutput(fileName); } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java 2011-02-21 00:17:08.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java 2011-05-17 16:46:43.000000000 +0200 @@ -20,12 +20,17 @@ import java.io.IOException; import java.util.Set; +import org.apache.lucene.index.PerDocWriteState; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.DocValuesConsumer; import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.FieldsProducer; +import org.apache.lucene.index.codecs.PerDocConsumer; +import org.apache.lucene.index.codecs.DefaultDocValuesConsumer; +import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.index.codecs.PostingsWriterBase; import org.apache.lucene.index.codecs.PostingsReaderBase; import org.apache.lucene.index.codecs.TermsIndexWriterBase; @@ -34,7 +39,10 @@ import org.apache.lucene.index.codecs.VariableGapTermsIndexReader; import org.apache.lucene.index.codecs.BlockTermsWriter; import org.apache.lucene.index.codecs.BlockTermsReader; +import org.apache.lucene.index.codecs.DefaultDocValuesProducer; +import org.apache.lucene.index.values.Writer; import org.apache.lucene.store.Directory; +import 
org.apache.lucene.util.BytesRef; /** Default codec. * @lucene.experimental */ @@ -130,15 +138,18 @@ static final String PROX_EXTENSION = "prx"; @Override - public void files(Directory dir, SegmentInfo segmentInfo, String id, Set files) throws IOException { - StandardPostingsReader.files(dir, segmentInfo, id, files); - BlockTermsReader.files(dir, segmentInfo, id, files); - VariableGapTermsIndexReader.files(dir, segmentInfo, id, files); + public void files(Directory dir, SegmentInfo segmentInfo, int id, Set files) throws IOException { + final String codecId = "" + id; + StandardPostingsReader.files(dir, segmentInfo, codecId, files); + BlockTermsReader.files(dir, segmentInfo, codecId, files); + VariableGapTermsIndexReader.files(dir, segmentInfo, codecId, files); + DefaultDocValuesConsumer.files(dir, segmentInfo, id, files); } @Override public void getExtensions(Set extensions) { getStandardExtensions(extensions); + DefaultDocValuesConsumer.getDocValuesExtensions(extensions); } public static void getStandardExtensions(Set extensions) { @@ -147,4 +158,14 @@ BlockTermsReader.getExtensions(extensions); VariableGapTermsIndexReader.getIndexExtensions(extensions); } + + @Override + public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { + return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); + } + + @Override + public PerDocValues docsProducer(SegmentReadState state) throws IOException { + return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId); + } } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java 2011-02-21 00:17:08.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java 2011-05-17 16:46:43.000000000 +0200 @@ -51,14 +51,14 @@ //private String segment; - public StandardPostingsReader(Directory dir, SegmentInfo segmentInfo, int readBufferSize, String codecId) throws IOException { - freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, codecId, StandardCodec.FREQ_EXTENSION), + public StandardPostingsReader(Directory dir, SegmentInfo segmentInfo, int readBufferSize, int codecId) throws IOException { + freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, ""+codecId, StandardCodec.FREQ_EXTENSION), readBufferSize); //this.segment = segmentInfo.name; if (segmentInfo.getHasProx()) { boolean success = false; try { - proxIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, codecId, StandardCodec.PROX_EXTENSION), + proxIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, ""+codecId, StandardCodec.PROX_EXTENSION), readBufferSize); success = true; } finally { diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java docvalues/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java --- trunk_2/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java 2011-05-29 22:19:08.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java 2011-06-03 22:42:58.000000000 +0200 @@ -91,14 +91,14 @@ this.skipInterval = skipInterval; this.skipMinimum = skipInterval; /* set to the same 
for now */ //this.segment = state.segmentName; - String fileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, StandardCodec.FREQ_EXTENSION); + String fileName = IndexFileNames.segmentFileName(state.segmentName, state.codecIdAsString(), StandardCodec.FREQ_EXTENSION); freqOut = state.directory.createOutput(fileName); boolean success = false; try { if (state.fieldInfos.hasProx()) { // At least one field does not omit TF, so create the // prox file - fileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, StandardCodec.PROX_EXTENSION); + fileName = IndexFileNames.segmentFileName(state.segmentName, state.codecIdAsString(), StandardCodec.PROX_EXTENSION); proxOut = state.directory.createOutput(fileName); } else { // Every field omits TF so we will write no prox file diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/values/Bytes.java docvalues/lucene/src/java/org/apache/lucene/index/values/Bytes.java --- trunk_2/lucene/src/java/org/apache/lucene/index/values/Bytes.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/values/Bytes.java 2011-06-03 22:42:57.000000000 +0200 @@ -0,0 +1,491 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Base class for specific Bytes Reader/Writer implementations */ +import java.io.IOException; +import java.util.Collection; +import java.util.Comparator; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.values.IndexDocValues.SortedSource; +import org.apache.lucene.index.values.IndexDocValues.Source; +import org.apache.lucene.index.values.IndexDocValues.SourceEnum; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.ByteBlockPool; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.PagedBytes; + +/** + * Provides concrete Writer/Reader implementations for byte[] value per + * document. There are 6 package-private default implementations of this, for + * all combinations of {@link Mode#DEREF}/{@link Mode#STRAIGHT}/ + * {@link Mode#SORTED} x fixed-length/variable-length. + * + *

+ * NOTE: Currently the total amount of byte[] data stored (across a single + * segment) cannot exceed 2GB. + * + * NOTE: Each byte[] must be <= 32768 bytes in length
+ * + * @lucene.experimental + */ +public final class Bytes { + // TODO - add bulk copy where possible + private Bytes() { /* don't instantiate! */ + } + + /** + * Defines the {@link Writer}s store mode. The writer will either store the + * bytes sequentially ({@link #STRAIGHT}, dereferenced ({@link #DEREF}) or + * sorted ({@link #SORTED}) + * + * @lucene.experimental + */ + public static enum Mode { + /** + * Mode for sequentially stored bytes + */ + STRAIGHT, + /** + * Mode for dereferenced stored bytes + */ + DEREF, + /** + * Mode for sorted stored bytes + */ + SORTED + }; + + /** + * Creates a new byte[] {@link Writer} instances for the given + * directory. + * + * @param dir + * the directory to write the values to + * @param id + * the id used to create a unique file name. Usually composed out of + * the segment name and a unique id per segment. + * @param mode + * the writers store mode + * @param comp + * a {@link BytesRef} comparator - only used with {@link Mode#SORTED} + * @param fixedSize + * true if all bytes subsequently passed to the + * {@link Writer} will have the same length + * @param bytesUsed + * an {@link AtomicLong} instance to track the used bytes within the + * {@link Writer}. A call to {@link Writer#finish(int)} will release + * all internally used resources and frees the memeory tracking + * reference. + * @return a new {@link Writer} instance + * @throws IOException + * if the files for the writer can not be created. + */ + public static Writer getWriter(Directory dir, String id, Mode mode, + Comparator comp, boolean fixedSize, AtomicLong bytesUsed) + throws IOException { + // TODO -- i shouldn't have to specify fixed? can + // track itself & do the write thing at write time? + if (comp == null) { + comp = BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + if (fixedSize) { + if (mode == Mode.STRAIGHT) { + return new FixedStraightBytesImpl.Writer(dir, id); + } else if (mode == Mode.DEREF) { + return new FixedDerefBytesImpl.Writer(dir, id, bytesUsed); + } else if (mode == Mode.SORTED) { + return new FixedSortedBytesImpl.Writer(dir, id, comp, bytesUsed); + } + } else { + if (mode == Mode.STRAIGHT) { + return new VarStraightBytesImpl.Writer(dir, id, bytesUsed); + } else if (mode == Mode.DEREF) { + return new VarDerefBytesImpl.Writer(dir, id, bytesUsed); + } else if (mode == Mode.SORTED) { + return new VarSortedBytesImpl.Writer(dir, id, comp, bytesUsed); + } + } + + throw new IllegalArgumentException(""); + } + + /** + * Creates a new {@link IndexDocValues} instance that provides either memory + * resident or iterative access to a per-document stored byte[] + * value. The returned {@link IndexDocValues} instance will be initialized without + * consuming a significant amount of memory. + * + * @param dir + * the directory to load the {@link IndexDocValues} from. + * @param id + * the file ID in the {@link Directory} to load the values from. + * @param mode + * the mode used to store the values + * @param fixedSize + * true iff the values are stored with fixed-size, + * otherwise false + * @param maxDoc + * the number of document values stored for the given ID + * @return an initialized {@link IndexDocValues} instance. + * @throws IOException + * if an {@link IOException} occurs + */ + public static IndexDocValues getValues(Directory dir, String id, Mode mode, + boolean fixedSize, int maxDoc) throws IOException { + // TODO -- I can peek @ header to determing fixed/mode? 
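The two factory methods, getWriter above and getValues (whose body continues below), are the public entry points of Bytes. A hedged round-trip sketch, with an illustrative file id and values; the Directory can be any Lucene Directory:

    import java.io.IOException;
    import java.util.concurrent.atomic.AtomicLong;

    import org.apache.lucene.index.values.Bytes;
    import org.apache.lucene.index.values.IndexDocValues;
    import org.apache.lucene.index.values.Writer;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.util.BytesRef;

    class BytesRoundTripExample {
      static BytesRef roundTrip(Directory dir) throws IOException {
        // fixed-size, dereferenced storage: equal values are stored only once
        Writer writer = Bytes.getWriter(dir, "_1_7", Bytes.Mode.DEREF, null, true, new AtomicLong());
        writer.add(0, new BytesRef(new byte[] {1, 2, 3, 4}));
        writer.add(3, new BytesRef(new byte[] {1, 2, 3, 4}));  // docs 1 and 2 keep the default (empty) value
        writer.finish(4);                                      // pass the segment's doc count

        IndexDocValues values = Bytes.getValues(dir, "_1_7", Bytes.Mode.DEREF, true, 4);
        return values.load().getBytes(3, new BytesRef());
      }
    }

Note the contract documented further down on BytesWriterBase.add: docIDs must arrive in increasing order, and skipped documents are padded with the default (zero-length) value when finish(docCount) runs.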
+ if (fixedSize) { + if (mode == Mode.STRAIGHT) { + return new FixedStraightBytesImpl.Reader(dir, id, maxDoc); + } else if (mode == Mode.DEREF) { + return new FixedDerefBytesImpl.Reader(dir, id, maxDoc); + } else if (mode == Mode.SORTED) { + return new FixedSortedBytesImpl.Reader(dir, id, maxDoc); + } + } else { + if (mode == Mode.STRAIGHT) { + return new VarStraightBytesImpl.Reader(dir, id, maxDoc); + } else if (mode == Mode.DEREF) { + return new VarDerefBytesImpl.Reader(dir, id, maxDoc); + } else if (mode == Mode.SORTED) { + return new VarSortedBytesImpl.Reader(dir, id, maxDoc); + } + } + + throw new IllegalArgumentException("Illegal Mode: " + mode); + } + + // TODO open up this API? + static abstract class BytesBaseSource extends Source { + private final PagedBytes pagedBytes; + protected final IndexInput datIn; + protected final IndexInput idxIn; + protected final static int PAGED_BYTES_BITS = 15; + protected final PagedBytes.Reader data; + protected final long totalLengthInBytes; + + protected BytesBaseSource(IndexInput datIn, IndexInput idxIn, + PagedBytes pagedBytes, long bytesToRead) throws IOException { + assert bytesToRead <= datIn.length() : " file size is less than the expected size diff: " + + (bytesToRead - datIn.length()) + " pos: " + datIn.getFilePointer(); + this.datIn = datIn; + this.totalLengthInBytes = bytesToRead; + this.pagedBytes = pagedBytes; + this.pagedBytes.copy(datIn, bytesToRead); + data = pagedBytes.freeze(true); + this.idxIn = idxIn; + } + + public void close() throws IOException { + try { + data.close(); // close data + } finally { + try { + if (datIn != null) { + datIn.close(); + } + } finally { + if (idxIn != null) {// if straight - no index needed + idxIn.close(); + } + } + } + } + + /** + * Returns one greater than the largest possible document number. + */ + protected abstract int maxDoc(); + + @Override + public ValuesEnum getEnum(AttributeSource attrSource) throws IOException { + return new SourceEnum(attrSource, type(), this, maxDoc()) { + @Override + public int advance(int target) throws IOException { + if (target >= numDocs) { + return pos = NO_MORE_DOCS; + } + while (source.getBytes(target, bytesRef).length == 0) { + if (++target >= numDocs) { + return pos = NO_MORE_DOCS; + } + } + return pos = target; + } + }; + } + + } + + static abstract class BytesBaseSortedSource extends SortedSource { + protected final IndexInput datIn; + protected final IndexInput idxIn; + protected final BytesRef defaultValue = new BytesRef(); + protected final static int PAGED_BYTES_BITS = 15; + private final PagedBytes pagedBytes; + protected final PagedBytes.Reader data; + private final Comparator comp; + + protected BytesBaseSortedSource(IndexInput datIn, IndexInput idxIn, + Comparator comp, PagedBytes pagedBytes, long bytesToRead) + throws IOException { + assert bytesToRead <= datIn.length() : " file size is less than the expected size diff: " + + (bytesToRead - datIn.length()) + " pos: " + datIn.getFilePointer(); + this.datIn = datIn; + this.pagedBytes = pagedBytes; + this.pagedBytes.copy(datIn, bytesToRead); + data = pagedBytes.freeze(true); + this.idxIn = idxIn; + this.comp = comp == null ? 
BytesRef.getUTF8SortedAsUnicodeComparator() + : comp; + + } + + @Override + public BytesRef getByOrd(int ord, BytesRef bytesRef) { + assert ord >= 0; + return deref(ord, bytesRef); + } + + protected void closeIndexInput() throws IOException { + try { + if (datIn != null) { + datIn.close(); + } + } finally { + if (idxIn != null) {// if straight + idxIn.close(); + } + } + } + + /** + * Returns the largest doc id + 1 in this doc values source + */ + protected abstract int maxDoc(); + + /** + * Copies the value for the given ord to the given {@link BytesRef} and + * returns it. + */ + protected abstract BytesRef deref(int ord, BytesRef bytesRef); + + protected int binarySearch(BytesRef b, BytesRef bytesRef, int low, + int high) { + int mid = 0; + while (low <= high) { + mid = (low + high) >>> 1; + deref(mid, bytesRef); + final int cmp = comp.compare(bytesRef, b); + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + return mid; + } + } + assert comp.compare(bytesRef, b) != 0; + return -(low + 1); + } + + @Override + public ValuesEnum getEnum(AttributeSource attrSource) throws IOException { + return new SourceEnum(attrSource, type(), this, maxDoc()) { + + @Override + public int advance(int target) throws IOException { + if (target >= numDocs) { + return pos = NO_MORE_DOCS; + } + while (source.getBytes(target, bytesRef).length == 0) { + if (++target >= numDocs) { + return pos = NO_MORE_DOCS; + } + } + return pos = target; + } + }; + } + } + + // TODO: open up this API?! + static abstract class BytesWriterBase extends Writer { + private final String id; + protected IndexOutput idxOut; + protected IndexOutput datOut; + protected BytesRef bytesRef; + protected final ByteBlockPool pool; + + protected BytesWriterBase(Directory dir, String id, String codecName, + int version, boolean initIndex, ByteBlockPool pool, + AtomicLong bytesUsed) throws IOException { + super(bytesUsed); + this.id = id; + this.pool = pool; + datOut = dir.createOutput(IndexFileNames.segmentFileName(id, "", + DATA_EXTENSION)); + boolean success = false; + try { + CodecUtil.writeHeader(datOut, codecName, version); + if (initIndex) { + idxOut = dir.createOutput(IndexFileNames.segmentFileName(id, "", + INDEX_EXTENSION)); + CodecUtil.writeHeader(idxOut, codecName, version); + } else { + idxOut = null; + } + success = true; + } finally { + if (!success) { + IOUtils.closeSafely(true, datOut, idxOut); + } + } + } + + /** + * Must be called only with increasing docIDs. It's OK for some docIDs to be + * skipped; they will be filled with 0 bytes. 
+ */ + @Override + public abstract void add(int docID, BytesRef bytes) throws IOException; + + @Override + public void finish(int docCount) throws IOException { + try { + IOUtils.closeSafely(false, datOut, idxOut); + } finally { + if (pool != null) { + pool.reset(); + } + } + } + + @Override + protected void add(int docID) throws IOException { + add(docID, bytesRef); + } + + @Override + public void add(int docID, PerDocFieldValues docValues) throws IOException { + final BytesRef ref; + if ((ref = docValues.getBytes()) != null) { + add(docID, ref); + } + } + + @Override + protected void setNextEnum(ValuesEnum valuesEnum) { + bytesRef = valuesEnum.bytes(); + } + + @Override + public void files(Collection files) throws IOException { + assert datOut != null; + files.add(IndexFileNames.segmentFileName(id, "", DATA_EXTENSION)); + if (idxOut != null) { // called after flush - so this must be initialized + // if needed or present + final String idxFile = IndexFileNames.segmentFileName(id, "", + INDEX_EXTENSION); + files.add(idxFile); + } + } + } + + /** + * Opens all necessary files, but does not read any data in until you call + * {@link #load}. + */ + static abstract class BytesReaderBase extends IndexDocValues { + protected final IndexInput idxIn; + protected final IndexInput datIn; + protected final int version; + protected final String id; + + protected BytesReaderBase(Directory dir, String id, String codecName, + int maxVersion, boolean doIndex) throws IOException { + this.id = id; + datIn = dir.openInput(IndexFileNames.segmentFileName(id, "", + Writer.DATA_EXTENSION)); + boolean success = false; + try { + version = CodecUtil.checkHeader(datIn, codecName, maxVersion, maxVersion); + if (doIndex) { + idxIn = dir.openInput(IndexFileNames.segmentFileName(id, "", + Writer.INDEX_EXTENSION)); + final int version2 = CodecUtil.checkHeader(idxIn, codecName, + maxVersion, maxVersion); + assert version == version2; + } else { + idxIn = null; + } + success = true; + } finally { + if (!success) { + closeInternal(); + } + } + } + + /** + * clones and returns the data {@link IndexInput} + */ + protected final IndexInput cloneData() { + assert datIn != null; + return (IndexInput) datIn.clone(); + } + + /** + * clones and returns the indexing {@link IndexInput} + */ + protected final IndexInput cloneIndex() { + assert idxIn != null; + return (IndexInput) idxIn.clone(); + } + + @Override + public void close() throws IOException { + try { + super.close(); + } finally { + closeInternal(); + } + } + + private void closeInternal() throws IOException { + try { + datIn.close(); + } finally { + if (idxIn != null) { + idxIn.close(); + } + } + } + } + +} \ No newline at end of file diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/values/FixedDerefBytesImpl.java docvalues/lucene/src/java/org/apache/lucene/index/values/FixedDerefBytesImpl.java --- trunk_2/lucene/src/java/org/apache/lucene/index/values/FixedDerefBytesImpl.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/values/FixedDerefBytesImpl.java 2011-06-03 22:42:57.000000000 +0200 @@ -0,0 +1,279 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.lucene.index.values.Bytes.BytesBaseSource; +import org.apache.lucene.index.values.Bytes.BytesReaderBase; +import org.apache.lucene.index.values.Bytes.BytesWriterBase; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.ByteBlockPool; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.PagedBytes; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.ByteBlockPool.Allocator; +import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator; +import org.apache.lucene.util.BytesRefHash.TrackingDirectBytesStartArray; +import org.apache.lucene.util.packed.PackedInts; + +// Stores fixed-length byte[] by deref, ie when two docs +// have the same value, they store only 1 byte[] +/** + * @lucene.experimental + */ +class FixedDerefBytesImpl { + + static final String CODEC_NAME = "FixedDerefBytes"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + static class Writer extends BytesWriterBase { + private int size = -1; + private int[] docToID; + private final BytesRefHash hash = new BytesRefHash(pool, + BytesRefHash.DEFAULT_CAPACITY, new TrackingDirectBytesStartArray( + BytesRefHash.DEFAULT_CAPACITY, bytesUsed)); + public Writer(Directory dir, String id, AtomicLong bytesUsed) + throws IOException { + this(dir, id, new DirectTrackingAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, bytesUsed), + bytesUsed); + } + + public Writer(Directory dir, String id, Allocator allocator, + AtomicLong bytesUsed) throws IOException { + super(dir, id, CODEC_NAME, VERSION_CURRENT, true, + new ByteBlockPool(allocator), bytesUsed); + docToID = new int[1]; + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT); // TODO BytesRefHash + // uses bytes too! 
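The add(...) method that follows carries the dedup bookkeeping of the deref variant. Restated as a small helper for readability (same logic as the patch, only pulled out; the hash is the BytesRefHash the writer already owns):

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.BytesRefHash;

    class DerefEncodingExample {
      // Ords are stored 1-based so that 0 can be reserved for "this document has no value".
      static int entryFor(BytesRefHash hash, BytesRef bytes) {
        int ord = hash.add(bytes);   // negative if the value was already added before
        if (ord < 0) {
          ord = (-ord) - 1;          // recover the ord of the existing entry
        }
        return 1 + ord;              // what gets written into docToID[docID]
      }
    }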
+ } + + @Override + public void add(int docID, BytesRef bytes) throws IOException { + if (bytes.length == 0) // default value - skip it + return; + if (size == -1) { + size = bytes.length; + datOut.writeInt(size); + } else if (bytes.length != size) { + throw new IllegalArgumentException("expected bytes size=" + size + + " but got " + bytes.length); + } + int ord = hash.add(bytes); + + if (ord >= 0) { + // new added entry + datOut.writeBytes(bytes.bytes, bytes.offset, bytes.length); + } else { + ord = (-ord) - 1; + } + + if (docID >= docToID.length) { + final int size = docToID.length; + docToID = ArrayUtil.grow(docToID, 1 + docID); + bytesUsed.addAndGet((docToID.length - size) + * RamUsageEstimator.NUM_BYTES_INT); + } + docToID[docID] = 1 + ord; + } + + // Important that we get docCount, in case there were + // some last docs that we didn't see + @Override + public void finish(int docCount) throws IOException { + try { + if (size == -1) { + datOut.writeInt(size); + } + final int count = 1 + hash.size(); + idxOut.writeInt(count - 1); + // write index + final PackedInts.Writer w = PackedInts.getWriter(idxOut, docCount, + PackedInts.bitsRequired(count - 1)); + final int limit = docCount > docToID.length ? docToID.length : docCount; + for (int i = 0; i < limit; i++) { + w.add(docToID[i]); + } + // fill up remaining doc with zeros + for (int i = limit; i < docCount; i++) { + w.add(0); + } + w.finish(); + } finally { + hash.close(); + super.finish(docCount); + bytesUsed + .addAndGet((-docToID.length) * RamUsageEstimator.NUM_BYTES_INT); + docToID = null; + } + } + } + + public static class Reader extends BytesReaderBase { + private final int size; + + Reader(Directory dir, String id, int maxDoc) throws IOException { + super(dir, id, CODEC_NAME, VERSION_START, true); + size = datIn.readInt(); + } + + @Override + public Source load() throws IOException { + final IndexInput index = cloneIndex(); + return new Source(cloneData(), index, size, index.readInt()); + } + + private static class Source extends BytesBaseSource { + private final PackedInts.Reader index; + private final int size; + private final int numValues; + + protected Source(IndexInput datIn, IndexInput idxIn, int size, + int numValues) throws IOException { + super(datIn, idxIn, new PagedBytes(PAGED_BYTES_BITS), size * numValues); + this.size = size; + this.numValues = numValues; + index = PackedInts.getReader(idxIn); + } + + @Override + public BytesRef getBytes(int docID, BytesRef bytesRef) { + final int id = (int) index.get(docID); + if (id == 0) { + bytesRef.length = 0; + return bytesRef; + } + return data.fillSlice(bytesRef, ((id - 1) * size), size); + } + + @Override + public int getValueCount() { + return numValues; + } + + @Override + public ValueType type() { + return ValueType.BYTES_FIXED_DEREF; + } + + @Override + protected int maxDoc() { + return index.size(); + } + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + return new DerefBytesEnum(source, cloneData(), cloneIndex(), size); + } + + static class DerefBytesEnum extends ValuesEnum { + protected final IndexInput datIn; + private final PackedInts.ReaderIterator idx; + protected final long fp; + private final int size; + private final int valueCount; + private int pos = -1; + + public DerefBytesEnum(AttributeSource source, IndexInput datIn, + IndexInput idxIn, int size) throws IOException { + this(source, datIn, idxIn, size, ValueType.BYTES_FIXED_DEREF); + } + + protected DerefBytesEnum(AttributeSource source, IndexInput datIn, + 
IndexInput idxIn, int size, ValueType enumType) throws IOException { + super(source, enumType); + this.datIn = datIn; + this.size = size; + idxIn.readInt();// read valueCount + idx = PackedInts.getReaderIterator(idxIn); + fp = datIn.getFilePointer(); + bytesRef.grow(this.size); + bytesRef.length = this.size; + bytesRef.offset = 0; + valueCount = idx.size(); + } + + protected void copyFrom(ValuesEnum valuesEnum) { + bytesRef = valuesEnum.bytesRef; + if (bytesRef.bytes.length < size) { + bytesRef.grow(size); + } + bytesRef.length = size; + bytesRef.offset = 0; + } + + @Override + public int advance(int target) throws IOException { + if (target < valueCount) { + long address; + while ((address = idx.advance(target)) == 0) { + if (++target >= valueCount) { + return pos = NO_MORE_DOCS; + } + } + pos = idx.ord(); + fill(address, bytesRef); + return pos; + } + return pos = NO_MORE_DOCS; + } + + @Override + public int nextDoc() throws IOException { + if (pos >= valueCount) { + return pos = NO_MORE_DOCS; + } + return advance(pos + 1); + } + + public void close() throws IOException { + try { + datIn.close(); + } finally { + idx.close(); + } + } + + protected void fill(long address, BytesRef ref) throws IOException { + datIn.seek(fp + ((address - 1) * size)); + datIn.readBytes(ref.bytes, 0, size); + ref.length = size; + ref.offset = 0; + } + + @Override + public int docID() { + return pos; + } + + } + + @Override + public ValueType type() { + return ValueType.BYTES_FIXED_DEREF; + } + } + +} diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/values/FixedSortedBytesImpl.java docvalues/lucene/src/java/org/apache/lucene/index/values/FixedSortedBytesImpl.java --- trunk_2/lucene/src/java/org/apache/lucene/index/values/FixedSortedBytesImpl.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/values/FixedSortedBytesImpl.java 2011-06-03 22:42:57.000000000 +0200 @@ -0,0 +1,242 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
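FixedSortedBytesImpl, whose body follows, layers an ord-based, sorted view over the dereferenced storage. A hedged lookup sketch, assuming an IndexDocValues instance that was written in Mode.SORTED (the probe value "lucene" is illustrative):

    import java.io.IOException;

    import org.apache.lucene.index.values.IndexDocValues;
    import org.apache.lucene.index.values.IndexDocValues.SortedSource;
    import org.apache.lucene.util.BytesRef;

    class SortedLookupExample {
      static BytesRef lookups(IndexDocValues sortedValues, int docID) throws IOException {
        SortedSource sorted = sortedValues.loadSorted(null); // null comparator: UTF-8/Unicode order is used
        int pos = sorted.getByValue(new BytesRef("lucene"), new BytesRef()); // negative if the value is absent
        int ord = sorted.ord(docID);                         // -1 when the document has no value
        return ord < 0 ? null : sorted.getByOrd(ord, new BytesRef());
      }
    }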
+ */ + +import java.io.IOException; +import java.util.Comparator; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.lucene.index.values.Bytes.BytesBaseSortedSource; +import org.apache.lucene.index.values.Bytes.BytesReaderBase; +import org.apache.lucene.index.values.Bytes.BytesWriterBase; +import org.apache.lucene.index.values.FixedDerefBytesImpl.Reader.DerefBytesEnum; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.ByteBlockPool; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.PagedBytes; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.ByteBlockPool.Allocator; +import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator; +import org.apache.lucene.util.BytesRefHash.TrackingDirectBytesStartArray; +import org.apache.lucene.util.packed.PackedInts; + +// Stores fixed-length byte[] by deref, ie when two docs +// have the same value, they store only 1 byte[] + +/** + * @lucene.experimental + */ +class FixedSortedBytesImpl { + + static final String CODEC_NAME = "FixedSortedBytes"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + static class Writer extends BytesWriterBase { + private int size = -1; + private int[] docToEntry; + private final Comparator comp; + + private final BytesRefHash hash = new BytesRefHash(pool, + BytesRefHash.DEFAULT_CAPACITY, new TrackingDirectBytesStartArray( + BytesRefHash.DEFAULT_CAPACITY, bytesUsed)); + + public Writer(Directory dir, String id, Comparator comp, + AtomicLong bytesUsed) throws IOException { + this(dir, id, comp, new DirectTrackingAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, bytesUsed), + bytesUsed); + } + + public Writer(Directory dir, String id, Comparator comp, + Allocator allocator, AtomicLong bytesUsed) throws IOException { + super(dir, id, CODEC_NAME, VERSION_CURRENT, true, + new ByteBlockPool(allocator), bytesUsed); + docToEntry = new int[1]; + // docToEntry[0] = -1; + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT); + this.comp = comp; + } + + @Override + public void add(int docID, BytesRef bytes) throws IOException { + if (bytes.length == 0) + return; // default - skip it + if (size == -1) { + size = bytes.length; + datOut.writeInt(size); + } else if (bytes.length != size) { + throw new IllegalArgumentException("expected bytes size=" + size + + " but got " + bytes.length); + } + if (docID >= docToEntry.length) { + final int[] newArray = new int[ArrayUtil.oversize(1 + docID, + RamUsageEstimator.NUM_BYTES_INT)]; + System.arraycopy(docToEntry, 0, newArray, 0, docToEntry.length); + bytesUsed.addAndGet((newArray.length - docToEntry.length) + * RamUsageEstimator.NUM_BYTES_INT); + docToEntry = newArray; + } + int e = hash.add(bytes); + docToEntry[docID] = 1 + (e < 0 ? 
(-e) - 1 : e); + } + + // Important that we get docCount, in case there were + // some last docs that we didn't see + @Override + public void finish(int docCount) throws IOException { + try { + if (size == -1) {// no data added + datOut.writeInt(size); + } + final int[] sortedEntries = hash.sort(comp); + final int count = hash.size(); + int[] address = new int[count]; + // first dump bytes data, recording address as we go + for (int i = 0; i < count; i++) { + final int e = sortedEntries[i]; + final BytesRef bytes = hash.get(e, new BytesRef()); + assert bytes.length == size; + datOut.writeBytes(bytes.bytes, bytes.offset, bytes.length); + address[e] = 1 + i; + } + + idxOut.writeInt(count); + + // next write index + PackedInts.Writer w = PackedInts.getWriter(idxOut, docCount, + PackedInts.bitsRequired(count)); + final int limit; + if (docCount > docToEntry.length) { + limit = docToEntry.length; + } else { + limit = docCount; + } + for (int i = 0; i < limit; i++) { + final int e = docToEntry[i]; + if (e == 0) { + // null is encoded as zero + w.add(0); + } else { + assert e > 0 && e <= count : "index must 0 > && <= " + count + + " was: " + e; + w.add(address[e - 1]); + } + } + + for (int i = limit; i < docCount; i++) { + w.add(0); + } + w.finish(); + } finally { + super.finish(docCount); + bytesUsed.addAndGet((-docToEntry.length) + * RamUsageEstimator.NUM_BYTES_INT); + docToEntry = null; + hash.close(); + } + } + } + + public static class Reader extends BytesReaderBase { + private final int size; + + public Reader(Directory dir, String id, int maxDoc) throws IOException { + super(dir, id, CODEC_NAME, VERSION_START, true); + size = datIn.readInt(); + } + + @Override + public org.apache.lucene.index.values.IndexDocValues.Source load() + throws IOException { + return loadSorted(null); + } + + @Override + public SortedSource loadSorted(Comparator comp) + throws IOException { + final IndexInput idxInput = cloneIndex(); + final IndexInput datInput = cloneData(); + datInput.seek(CodecUtil.headerLength(CODEC_NAME) + 4); + idxInput.seek(CodecUtil.headerLength(CODEC_NAME)); + return new Source(datInput, idxInput, size, idxInput.readInt(), comp); + } + + private static class Source extends BytesBaseSortedSource { + + private final PackedInts.Reader index; + private final int numValue; + private final int size; + + public Source(IndexInput datIn, IndexInput idxIn, int size, + int numValues, Comparator comp) throws IOException { + super(datIn, idxIn, comp, new PagedBytes(PAGED_BYTES_BITS), size + * numValues); + this.size = size; + this.numValue = numValues; + index = PackedInts.getReader(idxIn); + closeIndexInput(); + } + + @Override + public int ord(int docID) { + return (int) index.get(docID) -1; + } + + @Override + public int getByValue(BytesRef bytes, BytesRef tmpRef) { + return binarySearch(bytes, tmpRef, 0, numValue - 1); + } + + @Override + public int getValueCount() { + return numValue; + } + + @Override + protected BytesRef deref(int ord, BytesRef bytesRef) { + return data.fillSlice(bytesRef, (ord * size), size); + } + + @Override + public ValueType type() { + return ValueType.BYTES_FIXED_SORTED; + } + + @Override + protected int maxDoc() { + return index.size(); + } + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + // do unsorted + return new DerefBytesEnum(source, cloneData(), cloneIndex(), size); + } + + @Override + public ValueType type() { + return ValueType.BYTES_FIXED_SORTED; + } + } +} diff -ruN -x .svn -x build 
trunk_2/lucene/src/java/org/apache/lucene/index/values/FixedStraightBytesImpl.java docvalues/lucene/src/java/org/apache/lucene/index/values/FixedStraightBytesImpl.java --- trunk_2/lucene/src/java/org/apache/lucene/index/values/FixedStraightBytesImpl.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/values/FixedStraightBytesImpl.java 2011-06-03 22:42:57.000000000 +0200 @@ -0,0 +1,249 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.values.Bytes.BytesBaseSource; +import org.apache.lucene.index.values.Bytes.BytesReaderBase; +import org.apache.lucene.index.values.Bytes.BytesWriterBase; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.PagedBytes; + +// Simplest storage: stores fixed length byte[] per +// document, with no dedup and no sorting. +/** + * @lucene.experimental + */ +class FixedStraightBytesImpl { + + static final String CODEC_NAME = "FixedStraightBytes"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + static class Writer extends BytesWriterBase { + private int size = -1; + // start at -1 if the first added value is > 0 + private int lastDocID = -1; + private byte[] oneRecord; + + protected Writer(Directory dir, String id) throws IOException { + super(dir, id, CODEC_NAME, VERSION_CURRENT, false, null, null); + } + + // TODO - impl bulk copy here! + + @Override + public void add(int docID, BytesRef bytes) throws IOException { + if (size == -1) { + size = bytes.length; + datOut.writeInt(size); + oneRecord = new byte[size]; + } else if (bytes.length != size) { + throw new IllegalArgumentException("expected bytes size=" + size + + " but got " + bytes.length); + } + fill(docID); + assert bytes.bytes.length >= bytes.length; + datOut.writeBytes(bytes.bytes, bytes.offset, bytes.length); + } + + /* + * (non-Javadoc) + * + * @see + * org.apache.lucene.index.values.Writer#merge(org.apache.lucene.index.values + * .Writer.MergeState) + */ + @Override + protected void merge(MergeState state) throws IOException { + if (state.bits == null && state.reader instanceof Reader) { + Reader reader = (Reader) state.reader; + final int maxDocs = reader.maxDoc; + if (maxDocs == 0) + return; + if (size == -1) { + size = reader.size; + datOut.writeInt(size); + oneRecord = new byte[size]; + } + fill(state.docBase); + // TODO should we add a transfer to API to each reader? 
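Unlike the deref and sorted variants, the straight layout needs no indirection: with a fixed value size, the bytes for document d are simply the slice starting at d * size, which is exactly what the Reader's Source.getBytes(...) below computes via fillSlice. The addressing, illustrated with plain arrays:

    class StraightLayoutExample {
      // Fixed-size straight storage: values are concatenated back to back,
      // so a document's value is recovered by plain offset arithmetic.
      static byte[] valueFor(byte[] data, int size, int docID) {
        byte[] value = new byte[size];
        System.arraycopy(data, docID * size, value, 0, size);
        return value;
      }
    }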
+ final IndexInput cloneData = reader.cloneData(); + try { + datOut.copyBytes(cloneData, size * maxDocs); + } finally { + cloneData.close(); + } + + lastDocID += maxDocs - 1; + } else + super.merge(state); + } + + // Fills up to but not including this docID + private void fill(int docID) throws IOException { + assert size >= 0; + for (int i = lastDocID + 1; i < docID; i++) { + datOut.writeBytes(oneRecord, size); + } + lastDocID = docID; + } + + @Override + public void finish(int docCount) throws IOException { + try { + if (size == -1) {// no data added + datOut.writeInt(0); + } else { + fill(docCount); + } + } finally { + super.finish(docCount); + } + } + + public long ramBytesUsed() { + return oneRecord == null ? 0 : oneRecord.length; + } + + } + + public static class Reader extends BytesReaderBase { + private final int size; + private final int maxDoc; + + Reader(Directory dir, String id, int maxDoc) throws IOException { + super(dir, id, CODEC_NAME, VERSION_START, false); + size = datIn.readInt(); + this.maxDoc = maxDoc; + } + + @Override + public Source load() throws IOException { + return new Source(cloneData(), size, maxDoc); + } + + @Override + public void close() throws IOException { + datIn.close(); + } + + private static class Source extends BytesBaseSource { + private final int size; + private final int maxDoc; + + public Source(IndexInput datIn, int size, int maxDoc) + throws IOException { + super(datIn, null, new PagedBytes(PAGED_BYTES_BITS), size * maxDoc); + this.size = size; + this.maxDoc = maxDoc; + } + + @Override + public BytesRef getBytes(int docID, BytesRef bytesRef) { + return data.fillSlice(bytesRef, docID * size, size); + } + + @Override + public int getValueCount() { + throw new UnsupportedOperationException(); + } + + @Override + public ValueType type() { + return ValueType.BYTES_FIXED_STRAIGHT; + } + + @Override + protected int maxDoc() { + return maxDoc; + } + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + return new FixedStraightBytesEnum(source, cloneData(), size, maxDoc); + } + + private static final class FixedStraightBytesEnum extends ValuesEnum { + private final IndexInput datIn; + private final int size; + private final int maxDoc; + private int pos = -1; + private final long fp; + + public FixedStraightBytesEnum(AttributeSource source, IndexInput datIn, + int size, int maxDoc) throws IOException { + super(source, ValueType.BYTES_FIXED_STRAIGHT); + this.datIn = datIn; + this.size = size; + this.maxDoc = maxDoc; + bytesRef.grow(size); + bytesRef.length = size; + bytesRef.offset = 0; + fp = datIn.getFilePointer(); + } + + protected void copyFrom(ValuesEnum valuesEnum) { + bytesRef = valuesEnum.bytesRef; + if (bytesRef.bytes.length < size) { + bytesRef.grow(size); + } + bytesRef.length = size; + bytesRef.offset = 0; + } + + public void close() throws IOException { + datIn.close(); + } + + @Override + public int advance(int target) throws IOException { + if (target >= maxDoc || size == 0) { + return pos = NO_MORE_DOCS; + } + if ((target - 1) != pos) // pos inc == 1 + datIn.seek(fp + target * size); + datIn.readBytes(bytesRef.bytes, 0, size); + return pos = target; + } + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws IOException { + if (pos >= maxDoc) { + return pos = NO_MORE_DOCS; + } + return advance(pos + 1); + } + } + + @Override + public ValueType type() { + return ValueType.BYTES_FIXED_STRAIGHT; + } + } +} diff -ruN -x .svn -x build 
trunk_2/lucene/src/java/org/apache/lucene/index/values/Floats.java docvalues/lucene/src/java/org/apache/lucene/index/values/Floats.java --- trunk_2/lucene/src/java/org/apache/lucene/index/values/Floats.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/values/Floats.java 2011-06-03 22:42:57.000000000 +0200 @@ -0,0 +1,469 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; +import java.util.Collection; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.values.IndexDocValues.Source; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.FloatsRef; +import org.apache.lucene.util.IOUtils; + +/** + * Exposes {@link Writer} and reader ({@link Source}) for 32 bit and 64 bit + * floating point values. + *

+ * Current implementations store either 4 byte or 8 byte floating points with + * full precision without any compression. + * + * @lucene.experimental + */ +public class Floats { + // TODO - add bulk copy where possible + private static final String CODEC_NAME = "SimpleFloats"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + private static final int INT_DEFAULT = Float + .floatToRawIntBits(0.0f); + private static final long LONG_DEFAULT = Double + .doubleToRawLongBits(0.0d); + + + public static Writer getWriter(Directory dir, String id, int precisionBytes, + AtomicLong bytesUsed) throws IOException { + if (precisionBytes != 4 && precisionBytes != 8) { + throw new IllegalArgumentException("precisionBytes must be 4 or 8; got " + + precisionBytes); + } + if (precisionBytes == 4) { + return new Float4Writer(dir, id, bytesUsed); + } else { + return new Float8Writer(dir, id, bytesUsed); + } + } + + public static IndexDocValues getValues(Directory dir, String id, int maxDoc) + throws IOException { + return new FloatsReader(dir, id, maxDoc); + } + + abstract static class FloatsWriter extends Writer { + private final String id; + private FloatsRef floatsRef; + protected int lastDocId = -1; + protected IndexOutput datOut; + private final byte precision; + + protected FloatsWriter(Directory dir, String id, int precision, + AtomicLong bytesUsed) throws IOException { + super(bytesUsed); + this.id = id; + this.precision = (byte) precision; + datOut = dir.createOutput(IndexFileNames.segmentFileName(id, "", + Writer.DATA_EXTENSION)); + boolean success = false; + try { + CodecUtil.writeHeader(datOut, CODEC_NAME, VERSION_CURRENT); + assert datOut.getFilePointer() == CodecUtil.headerLength(CODEC_NAME); + datOut.writeByte(this.precision); + success = true; + } finally { + if (!success) { + IOUtils.closeSafely(true, datOut); + } + } + } + + + public long ramBytesUsed() { + return 0; + } + + @Override + protected void add(int docID) throws IOException { + add(docID, floatsRef.get()); + } + + @Override + public void add(int docID, PerDocFieldValues docValues) throws IOException { + add(docID, docValues.getFloat()); + } + + @Override + protected void setNextEnum(ValuesEnum valuesEnum) { + floatsRef = valuesEnum.getFloat(); + } + + protected abstract int fillDefault(int num) throws IOException; + + @Override + protected void merge(MergeState state) throws IOException { + if (state.bits == null && state.reader instanceof FloatsReader) { + // no deletes - bulk copy + // TODO: should be do bulks with deletes too? 
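The Floats factory mirrors Bytes: getWriter selects a 4-byte or 8-byte writer from precisionBytes, getValues opens a reader over the written file, and skipped documents are padded with 0.0. A hedged round-trip sketch with an illustrative file id:

    import java.io.IOException;
    import java.util.concurrent.atomic.AtomicLong;

    import org.apache.lucene.index.values.Floats;
    import org.apache.lucene.index.values.IndexDocValues;
    import org.apache.lucene.index.values.Writer;
    import org.apache.lucene.store.Directory;

    class FloatsRoundTripExample {
      static double roundTrip(Directory dir) throws IOException {
        Writer writer = Floats.getWriter(dir, "_1_9", 4, new AtomicLong()); // 4 = single precision
        writer.add(0, 3.5);      // stored as a 32-bit float
        writer.add(2, -1.25);    // doc 1 is padded with the 0.0f default
        writer.finish(3);

        IndexDocValues values = Floats.getValues(dir, "_1_9", 3);
        return values.load().getFloat(2);
      }
    }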
+ final FloatsReader reader = (FloatsReader) state.reader; + assert reader.precisionBytes == (int) precision; + if (reader.maxDoc == 0) + return; + final int docBase = state.docBase; + if (docBase - lastDocId > 1) { + // fill with default values + lastDocId += fillDefault(docBase - lastDocId - 1); + } + lastDocId += reader.transferTo(datOut); + } else + super.merge(state); + } + + @Override + public void files(Collection files) throws IOException { + files.add(IndexFileNames.segmentFileName(id, "", Writer.DATA_EXTENSION)); + } + + } + + // Writes 4 bytes (float) per value + static class Float4Writer extends FloatsWriter { + + protected Float4Writer(Directory dir, String id, AtomicLong bytesUsed) + throws IOException { + super(dir, id, 4, bytesUsed); + } + + @Override + public void add(final int docID, final double v) + throws IOException { + assert docID > lastDocId : "docID: " + docID + + " must be greater than the last added doc id: " + lastDocId; + if (docID - lastDocId > 1) { + // fill with default values + lastDocId += fillDefault(docID - lastDocId - 1); + } + assert datOut != null; + datOut.writeInt(Float.floatToRawIntBits((float) v)); + ++lastDocId; + } + + @Override + public void finish(int docCount) throws IOException { + try { + if (docCount > lastDocId + 1) + for (int i = lastDocId; i < docCount; i++) { + datOut.writeInt(INT_DEFAULT); // default value + } + } finally { + datOut.close(); + } + } + + @Override + protected int fillDefault(int numValues) throws IOException { + for (int i = 0; i < numValues; i++) { + datOut.writeInt(INT_DEFAULT); + } + return numValues; + } + } + + // Writes 8 bytes (double) per value + static class Float8Writer extends FloatsWriter { + + protected Float8Writer(Directory dir, String id, AtomicLong bytesUsed) + throws IOException { + super(dir, id, 8, bytesUsed); + } + + @Override + public void add(int docID, double v) throws IOException { + assert docID > lastDocId : "docID: " + docID + + " must be greater than the last added doc id: " + lastDocId; + if (docID - lastDocId > 1) { + // fill with default values + lastDocId += fillDefault(docID - lastDocId - 1); + } + assert datOut != null; + datOut.writeLong(Double.doubleToRawLongBits(v)); + ++lastDocId; + } + + @Override + public void finish(int docCount) throws IOException { + try { + if (docCount > lastDocId + 1) + for (int i = lastDocId; i < docCount; i++) { + datOut.writeLong(LONG_DEFAULT); // default value + } + } finally { + datOut.close(); + } + } + + @Override + protected int fillDefault(int numValues) throws IOException { + for (int i = 0; i < numValues; i++) { + datOut.writeLong(LONG_DEFAULT); + } + return numValues; + } + } + + /** + * Opens all necessary files, but does not read any data in until you call + * {@link #load}. + */ + static class FloatsReader extends IndexDocValues { + + private final IndexInput datIn; + private final int precisionBytes; + // TODO(simonw) is ByteBuffer the way to go here? 
+ private final int maxDoc; + + protected FloatsReader(Directory dir, String id, int maxDoc) + throws IOException { + datIn = dir.openInput(IndexFileNames.segmentFileName(id, "", + Writer.DATA_EXTENSION)); + CodecUtil.checkHeader(datIn, CODEC_NAME, VERSION_START, VERSION_START); + precisionBytes = datIn.readByte(); + assert precisionBytes == 4 || precisionBytes == 8; + this.maxDoc = maxDoc; + } + + int transferTo(IndexOutput out) throws IOException { + IndexInput indexInput = (IndexInput) datIn.clone(); + try { + indexInput.seek(CodecUtil.headerLength(CODEC_NAME)); + // skip precision: + indexInput.readByte(); + out.copyBytes(indexInput, precisionBytes * maxDoc); + } finally { + indexInput.close(); + } + return maxDoc; + } + + /** + * Loads the actual values. You may call this more than once, eg if you + * already previously loaded but then discarded the Source. + */ + @Override + public Source load() throws IOException { + /* we always read BIG_ENDIAN here since the writer uses + * DataOutput#writeInt() / writeLong() we can simply read the ints / longs + * back in using readInt / readLong */ + final IndexInput indexInput = (IndexInput) datIn.clone(); + indexInput.seek(CodecUtil.headerLength(CODEC_NAME)); + // skip precision: + indexInput.readByte(); + if (precisionBytes == 4) { + final float[] values = new float[(4 * maxDoc) >> 2]; + assert values.length == maxDoc; + for (int i = 0; i < values.length; i++) { + values[i] = Float.intBitsToFloat(indexInput.readInt()); + } + return new Source4(values); + } else { + final double[] values = new double[(8 * maxDoc) >> 3]; + assert values.length == maxDoc; + for (int i = 0; i < values.length; i++) { + values[i] = Double.longBitsToDouble(indexInput.readLong()); + } + return new Source8(values); + } + } + + private class Source4 extends Source { + private final float[] values; + + Source4(final float[] values ) throws IOException { + this.values = values; + } + + @Override + public double getFloat(int docID) { + return values[docID]; + } + + @Override + public ValuesEnum getEnum(AttributeSource attrSource) + throws IOException { + return new SourceEnum(attrSource, ValueType.FLOAT_32, this, maxDoc) { + @Override + public int advance(int target) throws IOException { + if (target >= numDocs) + return pos = NO_MORE_DOCS; + floatsRef.floats[floatsRef.offset] = source.getFloat(target); + return pos = target; + } + }; + } + + @Override + public ValueType type() { + return ValueType.FLOAT_32; + } + } + + private class Source8 extends Source { + private final double[] values; + + Source8(final double[] values) throws IOException { + this.values = values; + } + + @Override + public double getFloat(int docID) { + return values[docID]; + } + + @Override + public ValuesEnum getEnum(AttributeSource attrSource) + throws IOException { + return new SourceEnum(attrSource, type(), this, maxDoc) { + @Override + public int advance(int target) throws IOException { + if (target >= numDocs) + return pos = NO_MORE_DOCS; + floatsRef.floats[floatsRef.offset] = source.getFloat(target); + return pos = target; + } + }; + } + + @Override + public ValueType type() { + return ValueType.FLOAT_64; + } + } + + @Override + public void close() throws IOException { + super.close(); + datIn.close(); + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + IndexInput indexInput = (IndexInput) datIn.clone(); + indexInput.seek(CodecUtil.headerLength(CODEC_NAME)); + // skip precision: + indexInput.readByte(); + return precisionBytes == 4 ? 
new Floats4Enum(source, indexInput, maxDoc) + : new Floats8EnumImpl(source, indexInput, maxDoc); + } + + @Override + public ValueType type() { + return precisionBytes == 4 ? ValueType.FLOAT_32 + : ValueType.FLOAT_64; + } + } + + static final class Floats4Enum extends FloatsEnumImpl { + + Floats4Enum(AttributeSource source, IndexInput dataIn, int maxDoc) + throws IOException { + super(source, dataIn, 4, maxDoc, ValueType.FLOAT_32); + } + + @Override + public int advance(int target) throws IOException { + if (target >= maxDoc) + return pos = NO_MORE_DOCS; + dataIn.seek(fp + (target * precision)); + final int intBits = dataIn.readInt(); + floatsRef.floats[0] = Float.intBitsToFloat(intBits); + floatsRef.offset = 0; + return pos = target; + } + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws IOException { + if (pos >= maxDoc) { + return pos = NO_MORE_DOCS; + } + return advance(pos + 1); + } + } + + private static final class Floats8EnumImpl extends FloatsEnumImpl { + + Floats8EnumImpl(AttributeSource source, IndexInput dataIn, int maxDoc) + throws IOException { + super(source, dataIn, 8, maxDoc, ValueType.FLOAT_64); + } + + @Override + public int advance(int target) throws IOException { + if (target >= maxDoc) { + return pos = NO_MORE_DOCS; + } + dataIn.seek(fp + (target * precision)); + final long value = dataIn.readLong(); + floatsRef.floats[floatsRef.offset] = Double.longBitsToDouble(value); + return pos = target; + } + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws IOException { + if (pos >= maxDoc) { + return pos = NO_MORE_DOCS; + } + return advance(pos + 1); + } + } + + static abstract class FloatsEnumImpl extends ValuesEnum { + protected final IndexInput dataIn; + protected int pos = -1; + protected final int precision; + protected final int maxDoc; + protected final long fp; + + FloatsEnumImpl(AttributeSource source, IndexInput dataIn, int precision, + int maxDoc, ValueType type) throws IOException { + super(source, precision == 4 ? ValueType.FLOAT_32 + : ValueType.FLOAT_64); + this.dataIn = dataIn; + this.precision = precision; + this.maxDoc = maxDoc; + fp = dataIn.getFilePointer(); + floatsRef.offset = 0; + } + + @Override + public void close() throws IOException { + dataIn.close(); + } + } +} \ No newline at end of file diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/values/IndexDocValues.java docvalues/lucene/src/java/org/apache/lucene/index/values/IndexDocValues.java --- trunk_2/lucene/src/java/org/apache/lucene/index/values/IndexDocValues.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/values/IndexDocValues.java 2011-06-03 22:42:57.000000000 +0200 @@ -0,0 +1,364 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.Closeable; +import java.io.IOException; +import java.util.Comparator; + +import org.apache.lucene.document.IndexDocValuesField; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.FieldsEnum; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; + +/** + * {@link IndexDocValues} provides a dense per-document typed storage for fast + * value access based on the lucene internal document id. {@link IndexDocValues} + * exposes two distinct APIs: + *

+ * <ul>
+ * <li>via {@link Source} an entirely RAM resident API for random access</li>
+ * <li>via {@link ValuesEnum} a disk resident API for sequential access</li>
+ * </ul>
{@link IndexDocValues} are exposed via + * {@link IndexReader#perDocValues()} on a per-segment basis. For best + * performance {@link IndexDocValues} should be consumed per-segment just like + * IndexReader. + *
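A hedged sketch of what per-segment consumption of the two APIs listed above might look like for an integer typed field. How the IndexDocValues instance is obtained from the segment reader is deliberately left out, since the PerDocValues accessor itself is not part of this file; the method simply receives the per-segment instance.

// Sketch only; uses IndexDocValues, IndexDocValues.Source and ValuesEnum from this package.
void consumeSegment(IndexDocValues docValues, int maxDocInSegment) throws IOException {
  // (1) RAM resident random access: Source behaves like an array lookup
  IndexDocValues.Source source = docValues.getSource(); // cached, see getSource() below
  for (int doc = 0; doc < maxDocInSegment; doc++) {
    long v = source.getInt(doc); // 0 for documents without a value (INTS default)
  }

  // (2) disk resident sequential access: ValuesEnum is a DocIdSetIterator
  ValuesEnum valuesEnum = docValues.getEnum();
  try {
    int doc;
    while ((doc = valuesEnum.nextDoc()) != ValuesEnum.NO_MORE_DOCS) {
      long v = valuesEnum.getInt().get(); // the enum reuses its LongsRef per document
    }
  } finally {
    valuesEnum.close();
  }
}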

+ * {@link IndexDocValues} are fully integrated into the {@link Codec} API. + * Custom implementations can be exposed on a per field basis via + * {@link CodecProvider}. + * + * @see ValueType for limitations and default implementation documentation + * @see IndexDocValuesField for adding values to the index + * @see Codec#docsConsumer(org.apache.lucene.index.PerDocWriteState) for + * customization + * @lucene.experimental + */ +public abstract class IndexDocValues implements Closeable { + /* + * TODO: it might be useful to add another Random Access enum for some + * implementations like packed ints and only return such a random access enum + * if the impl supports random access. For super large segments it might be + * useful or even required in certain environements to have disc based random + * access + */ + public static final IndexDocValues[] EMPTY_ARRAY = new IndexDocValues[0]; + + private SourceCache cache = new SourceCache.DirectSourceCache(); + + /** + * Returns an iterator that steps through all documents values for this + * {@link IndexDocValues} field instance. {@link ValuesEnum} will skip document + * without a value if applicable. + */ + public ValuesEnum getEnum() throws IOException { + return getEnum(null); + } + + /** + * Returns an iterator that steps through all documents values for this + * {@link IndexDocValues} field instance. {@link ValuesEnum} will skip document + * without a value if applicable. + *

+ * If an {@link AttributeSource} is supplied to this method the + * {@link ValuesEnum} will use the given source to access implementation + * related attributes. + */ + public abstract ValuesEnum getEnum(AttributeSource attrSource) + throws IOException; + + /** + * Loads a new {@link Source} instance for this {@link IndexDocValues} field + * instance. Source instances returned from this method are not cached. It is + * the callers responsibility to maintain the instance and release its + * resources once the source is not needed anymore. + *

+ * This method will return null iff this {@link IndexDocValues} represent a + * {@link SortedSource}. + *

+ * For managed {@link Source} instances see {@link #getSource()}. + * + * @see #getSource() + * @see #setCache(SourceCache) + */ + public abstract Source load() throws IOException; + + /** + * Returns a {@link Source} instance through the current {@link SourceCache}. + * Iff no {@link Source} has been loaded into the cache so far the source will + * be loaded through {@link #load()} and passed to the {@link SourceCache}. + * The caller of this method should not close the obtained {@link Source} + * instance unless it is not needed for the rest of its life time. + *

+ * {@link Source} instances obtained from this method are closed / released + * from the cache once this {@link IndexDocValues} instance is closed by the + * {@link IndexReader}, {@link Fields} or {@link FieldsEnum} the + * {@link IndexDocValues} was created from. + *
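As a concrete illustration of the lifecycle described above, a cached Source can simply be re-fetched wherever it is needed; the cache, not the caller, owns it. A short sketch, assuming docValues is an open IndexDocValues instance:

// Both calls go through the installed SourceCache, so the values are loaded
// into RAM at most once per IndexDocValues instance.
IndexDocValues.Source first = docValues.getSource();
IndexDocValues.Source second = docValues.getSource();
// with the default DirectSourceCache both references point at the same instance
assert first == second;
// no explicit release here: the cache frees the Source once the owning reader
// closes this IndexDocValues instance, as described above.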

+ * This method will return null iff this {@link IndexDocValues} represent a + * {@link SortedSource}. + */ + public Source getSource() throws IOException { + return cache.load(this); + } + + /** + * Returns a {@link SortedSource} instance for this {@link IndexDocValues} field + * instance like {@link #getSource()}. + *

+ * This method will return null iff this {@link IndexDocValues} represent a + * {@link Source} instead of a {@link SortedSource}. + */ + public SortedSource getSortedSorted(Comparator comparator) + throws IOException { + return cache.loadSorted(this, comparator); + } + + /** + * Loads and returns a {@link SortedSource} instance for this + * {@link IndexDocValues} field instance like {@link #load()}. + *

+ * This method will return null iff this {@link IndexDocValues} represent a + * {@link Source} instead of a {@link SortedSource}. + */ + public SortedSource loadSorted(Comparator comparator) + throws IOException { + throw new UnsupportedOperationException(); + } + + /** + * Returns the {@link ValueType} of this {@link IndexDocValues} instance + */ + public abstract ValueType type(); + + /** + * Closes this {@link IndexDocValues} instance. This method should only be called + * by the creator of this {@link IndexDocValues} instance. API users should not + * close {@link IndexDocValues} instances. + */ + public void close() throws IOException { + cache.close(this); + } + + /** + * Sets the {@link SourceCache} used by this {@link IndexDocValues} instance. This + * method should be called before {@link #load()} or + * {@link #loadSorted(Comparator)} is called. All {@link Source} or + * {@link SortedSource} instances in the currently used cache will be closed + * before the new cache is installed. + *

+ * Note: All instances previously obtained from {@link #load()} or + * {@link #loadSorted(Comparator)} will be closed. + * + * @throws IllegalArgumentException + * if the given cache is null + * + */ + public void setCache(SourceCache cache) { + if (cache == null) + throw new IllegalArgumentException("cache must not be null"); + synchronized (this.cache) { + this.cache.close(this); + this.cache = cache; + } + } + + /** + * Source of per document values like long, double or {@link BytesRef} + * depending on the {@link IndexDocValues} fields {@link ValueType}. Source + * implementations provide random access semantics similar to array lookups + * and typically are entirely memory resident. + *

+ * {@link Source} defines 3 {@link ValueType} //TODO finish this + */ + public static abstract class Source { + + /** + * Returns a long for the given document id or throws an + * {@link UnsupportedOperationException} if this source doesn't support + * long values. + * + * @throws UnsupportedOperationException + * if this source doesn't support long values. + */ + public long getInt(int docID) { + throw new UnsupportedOperationException("ints are not supported"); + } + + /** + * Returns a double for the given document id or throws an + * {@link UnsupportedOperationException} if this source doesn't support + * double values. + * + * @throws UnsupportedOperationException + * if this source doesn't support double values. + */ + public double getFloat(int docID) { + throw new UnsupportedOperationException("floats are not supported"); + } + + /** + * Returns a {@link BytesRef} for the given document id or throws an + * {@link UnsupportedOperationException} if this source doesn't support + * byte[] values. + * + * @throws UnsupportedOperationException + * if this source doesn't support byte[] values. + */ + public BytesRef getBytes(int docID, BytesRef ref) { + throw new UnsupportedOperationException("bytes are not supported"); + } + + /** + * Returns number of unique values. Some implementations may throw + * UnsupportedOperationException. + */ + public int getValueCount() { + throw new UnsupportedOperationException(); + } + + /** + * Returns a {@link ValuesEnum} for this source. + */ + public ValuesEnum getEnum() throws IOException { + return getEnum(null); + } + + /** + * Returns the {@link ValueType} of this source. + * + * @return the {@link ValueType} of this source. + */ + public abstract ValueType type(); + + /** + * Returns a {@link ValuesEnum} for this source which uses the given + * {@link AttributeSource}. + */ + public abstract ValuesEnum getEnum(AttributeSource attrSource) + throws IOException; + } + + /** + * {@link ValuesEnum} utility for {@link Source} implemenations. + * + */ + public abstract static class SourceEnum extends ValuesEnum { + protected final Source source; + protected final int numDocs; + protected int pos = -1; + + /** + * Creates a new {@link SourceEnum} + * + * @param attrs + * the {@link AttributeSource} for this enum + * @param type + * the enums {@link ValueType} + * @param source + * the source this enum operates on + * @param numDocs + * the number of documents within the source + */ + protected SourceEnum(AttributeSource attrs, ValueType type, Source source, + int numDocs) { + super(attrs, type); + this.source = source; + this.numDocs = numDocs; + } + + @Override + public void close() throws IOException { + } + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws IOException { + if (pos == NO_MORE_DOCS) + return NO_MORE_DOCS; + return advance(pos + 1); + } + } + + /** + * A sorted variant of {@link Source} for byte[] values per document. + *

+ * Note: {@link ValuesEnum} obtained from a {@link SortedSource} will + * enumerate values in document order and not in sorted order. + */ + public static abstract class SortedSource extends Source { + + @Override + public BytesRef getBytes(int docID, BytesRef bytesRef) { + final int ord = ord(docID); + if (ord < 0) { + bytesRef.length = 0; + } else { + getByOrd(ord , bytesRef); + } + return bytesRef; + } + + /** + * Returns ord for specified docID. If this docID had not been added to the + * Writer, the ord is 0. Ord is dense, ie, starts at 0, then increments by 1 + * for the next (as defined by {@link Comparator} value. + */ + public abstract int ord(int docID); + + /** Returns value for specified ord. */ + public abstract BytesRef getByOrd(int ord, BytesRef bytesRef); + + + /** + * Finds the ordinal whose value is greater or equal to the given value. + * + * @return the given values ordinal if found or otherwise + * (-(ord)-1), defined as the ordinal of the first + * element that is greater than the given value. This guarantees + * that the return value will always be >= 0 if the given value + * is found. + * + */ + public final int getByValue(BytesRef value) { + return getByValue(value, new BytesRef()); + } + + /** + * Performs a lookup by value. + * + * @param value + * the value to look up + * @param tmpRef + * a temporary {@link BytesRef} instance used to compare internal + * values to the given value. Must not be null + * @return the given values ordinal if found or otherwise + * (-(ord)-1), defined as the ordinal of the first + * element that is greater than the given value. This guarantees + * that the return value will always be >= 0 if the given value + * is found. + */ + public abstract int getByValue(BytesRef value, BytesRef tmpRef); + } +} diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/values/Ints.java docvalues/lucene/src/java/org/apache/lucene/index/values/Ints.java --- trunk_2/lucene/src/java/org/apache/lucene/index/values/Ints.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/values/Ints.java 2011-06-03 22:42:57.000000000 +0200 @@ -0,0 +1,46 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
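Referring back to SortedSource, defined just above at the end of IndexDocValues.java: its getByValue return convention mirrors binary-search style insertion points. A small, hypothetical sketch of interpreting the result, assuming sortedSource was obtained for a sorted bytes field (e.g. via getSortedSorted with a BytesRef comparator):

// sortedSource: a SortedSource for a BYTES_*_SORTED field, e.g. obtained via
// docValues.getSortedSorted(BytesRef.getUTF8SortedAsUnicodeComparator())
BytesRef key = new BytesRef("lucene");      // value to look up (made up)
int ord = sortedSource.getByValue(key);
if (ord >= 0) {
  // exact hit: ord is the dense, 0-based ordinal of the value
  BytesRef hit = sortedSource.getByOrd(ord, new BytesRef());
} else {
  // miss: -(ord)-1 is the ordinal of the first value greater than key
  int firstGreaterOrd = -ord - 1;
}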
+ */ + +import java.io.IOException; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.lucene.index.values.IntsImpl.IntsReader; +import org.apache.lucene.index.values.IntsImpl.IntsWriter; +import org.apache.lucene.store.Directory; + +/** + * @lucene.experimental + */ +public class Ints { + // TODO - add bulk copy where possible + + private Ints() { + } + + public static Writer getWriter(Directory dir, String id, + boolean useFixedArray, AtomicLong bytesUsed) throws IOException { + // TODO - implement fixed?! + return new IntsWriter(dir, id, bytesUsed); + } + + public static IndexDocValues getValues(Directory dir, String id, + boolean useFixedArray) throws IOException { + return new IntsReader(dir, id); + } +} diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/values/IntsImpl.java docvalues/lucene/src/java/org/apache/lucene/index/values/IntsImpl.java --- trunk_2/lucene/src/java/org/apache/lucene/index/values/IntsImpl.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/values/IntsImpl.java 2011-06-03 22:42:57.000000000 +0200 @@ -0,0 +1,430 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; +import java.util.Collection; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.LongsRef; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.packed.PackedInts; + +/** + * Stores ints packed with fixed-bit precision. + * + * @lucene.experimental + * */ +class IntsImpl { + + private static final String CODEC_NAME = "Ints"; + private static final byte PACKED = 0x00; + private static final byte FIXED = 0x01; + + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + static class IntsWriter extends Writer { + + // TODO: can we bulkcopy this on a merge? 
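A minimal, hypothetical sketch of the Ints entry points defined just above (Ints.getWriter / Ints.getValues); the segment id and values are invented, and the useFixedArray flag is not yet honored (see the TODO). The packed variant written by writePackedInts further below stores each value as an offset from the smallest value seen, with a reserved sentinel for documents that never received a value, so such documents read back as 0:

// Sketch only; uses Ints, IndexDocValues and Writer from this package.
Directory dir = new RAMDirectory();
AtomicLong bytesUsed = new AtomicLong();
Writer writer = Ints.getWriter(dir, "_0_2", false, bytesUsed);
writer.add(0, 100L);
writer.add(2, 105L);        // doc 1 gets no value
writer.finish(4);           // 4 documents in the segment

IndexDocValues values = Ints.getValues(dir, "_0_2", false);
IndexDocValues.Source source = values.getSource();
long v0 = source.getInt(0); // 100
long v1 = source.getInt(1); // 0 -- no value was added for doc 1
values.close();
dir.close();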
+ private LongsRef intsRef; + private long[] docToValue; + private long minValue; + private long maxValue; + private boolean started; + private final String id; + private int lastDocId = -1; + private IndexOutput datOut; + + protected IntsWriter(Directory dir, String id, AtomicLong bytesUsed) + throws IOException { + super(bytesUsed); + datOut = dir.createOutput(IndexFileNames.segmentFileName(id, "", + DATA_EXTENSION)); + boolean success = false; + try { + CodecUtil.writeHeader(datOut, CODEC_NAME, VERSION_CURRENT); + this.id = id; + docToValue = new long[1]; + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_LONG); // TODO the + // bitset + // needs memory + // too + success = true; + } finally { + if (!success) { + datOut.close(); + } + } + } + + @Override + public void add(int docID, long v) throws IOException { + assert lastDocId < docID; + if (!started) { + started = true; + minValue = maxValue = v; + } else { + if (v < minValue) { + minValue = v; + } else if (v > maxValue) { + maxValue = v; + } + } + lastDocId = docID; + + if (docID >= docToValue.length) { + final long len = docToValue.length; + docToValue = ArrayUtil.grow(docToValue, 1 + docID); + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_LONG + * ((docToValue.length) - len)); + } + docToValue[docID] = v; + } + + @Override + public void finish(int docCount) throws IOException { + try { + if (!started) { + minValue = maxValue = 0; + } + // if we exceed the range of positive longs we must switch to fixed ints + if ((maxValue - minValue) < (((long)1) << 63) && (maxValue - minValue) > 0) { + writePackedInts(docCount); + } else { + writeFixedInts(docCount); + } + + } finally { + datOut.close(); + bytesUsed + .addAndGet(-(RamUsageEstimator.NUM_BYTES_LONG * docToValue.length)); + docToValue = null; + } + } + + private void writeFixedInts(int docCount) throws IOException { + datOut.writeByte(FIXED); + datOut.writeInt(docCount); + for (int i = 0; i < docToValue.length; i++) { + datOut.writeLong(docToValue[i]); // write full array - we use 0 as default + } + for (int i = docToValue.length; i < docCount; i++) { + datOut.writeLong(0); // fill with defaults values + } + } + + private void writePackedInts(int docCount) throws IOException { + datOut.writeByte(PACKED); + // TODO -- long can't work right since it's signed + datOut.writeLong(minValue); + // write a default value to recognize docs without a value for that + // field + final long defaultValue = maxValue>= 0 && minValue <=0 ? 0-minValue : ++maxValue-minValue; + datOut.writeLong(defaultValue); + PackedInts.Writer w = PackedInts.getWriter(datOut, docCount, + PackedInts.bitsRequired(maxValue-minValue)); + final int limit = docToValue.length > docCount ? docCount : docToValue.length; + for (int i = 0; i < limit; i++) { + w.add(docToValue[i] == 0 ? defaultValue : docToValue[i] - minValue); + } + for (int i = limit; i < docCount; i++) { + w.add(defaultValue); + } + + w.finish(); + } + + @Override + protected void add(int docID) throws IOException { + add(docID, intsRef.get()); + } + + @Override + protected void setNextEnum(ValuesEnum valuesEnum) { + intsRef = valuesEnum.getInt(); + } + + @Override + public void add(int docID, PerDocFieldValues docValues) throws IOException { + add(docID, docValues.getInt()); + } + + @Override + public void files(Collection files) throws IOException { + files.add(IndexFileNames.segmentFileName(id, "", DATA_EXTENSION)); + } + } + + /** + * Opens all necessary files, but does not read any data in until you call + * {@link #load}. 
+ */ + static class IntsReader extends IndexDocValues { + private final IndexInput datIn; + private final boolean packed; + + protected IntsReader(Directory dir, String id) throws IOException { + datIn = dir.openInput(IndexFileNames.segmentFileName(id, "", + Writer.DATA_EXTENSION)); + boolean success = false; + try { + CodecUtil.checkHeader(datIn, CODEC_NAME, VERSION_START, VERSION_START); + packed = PACKED == datIn.readByte(); + success = true; + } finally { + if (!success) { + IOUtils.closeSafely(true, datIn); + } + } + } + + /** + * Loads the actual values. You may call this more than once, eg if you + * already previously loaded but then discarded the Source. + */ + @Override + public Source load() throws IOException { + final IndexInput input = (IndexInput) datIn.clone(); + boolean success = false; + try { + final Source source = packed ? new PackedIntsSource(input) + : new FixedIntsSource(input); + success = true; + return source; + } finally { + if (!success) { + IOUtils.closeSafely(true, datIn); + } + } + } + + private static class FixedIntsSource extends Source { + private final long[] values; + public FixedIntsSource(IndexInput dataIn) throws IOException { + dataIn.seek(CodecUtil.headerLength(CODEC_NAME) + 1); + final int numDocs = dataIn.readInt(); + values = new long[numDocs]; + for (int i = 0; i < values.length; i++) { + values[i] = dataIn.readLong(); + } + } + + @Override + public long getInt(int docID) { + assert docID >= 0 && docID < values.length; + return values[docID]; + } + + @Override + public ValueType type() { + return ValueType.INTS; + } + + @Override + public ValuesEnum getEnum(AttributeSource attrSource) + throws IOException { + return new SourceEnum(attrSource, type(), this, values.length) { + + @Override + public int advance(int target) throws IOException { + if (target >= numDocs) + return pos = NO_MORE_DOCS; + intsRef.ints[intsRef.offset] = values[target]; + return pos = target; + } + }; + } + + } + + private static class PackedIntsSource extends Source { + private final long minValue; + private final long defaultValue; + private final PackedInts.Reader values; + + public PackedIntsSource(IndexInput dataIn) throws IOException { + dataIn.seek(CodecUtil.headerLength(CODEC_NAME) + 1); + minValue = dataIn.readLong(); + defaultValue = dataIn.readLong(); + values = PackedInts.getReader(dataIn); + } + + @Override + public long getInt(int docID) { + // TODO -- can we somehow avoid 2X method calls + // on each get? must push minValue down, and make + // PackedInts implement Ints.Source + assert docID >= 0; + final long value = values.get(docID); + return value == defaultValue ? 0 : minValue + value; + } + + @Override + public ValuesEnum getEnum(AttributeSource attrSource) + throws IOException { + return new SourceEnum(attrSource, type(), this, values.size()) { + @Override + public int advance(int target) throws IOException { + if (target >= numDocs) + return pos = NO_MORE_DOCS; + intsRef.ints[intsRef.offset] = source.getInt(target); + return pos = target; + } + }; + } + + @Override + public ValueType type() { + return ValueType.INTS; + } + } + + @Override + public void close() throws IOException { + super.close(); + datIn.close(); + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + final IndexInput input = (IndexInput) datIn.clone(); + boolean success = false; + try { + ValuesEnum inst = packed ? 
new PackedIntsEnumImpl(source, input) + : new FixedIntsEnumImpl(source, input); + success = true; + return inst; + } finally { + if (!success) { + IOUtils.closeSafely(true, input); + } + } + } + + @Override + public ValueType type() { + return ValueType.INTS; + } + + } + + private static final class PackedIntsEnumImpl extends ValuesEnum { + private final PackedInts.ReaderIterator ints; + private long minValue; + private final IndexInput dataIn; + private final long defaultValue; + private final int maxDoc; + private int pos = -1; + + private PackedIntsEnumImpl(AttributeSource source, IndexInput dataIn) + throws IOException { + super(source, ValueType.INTS); + intsRef.offset = 0; + this.dataIn = dataIn; + dataIn.seek(CodecUtil.headerLength(CODEC_NAME) + 1); + minValue = dataIn.readLong(); + defaultValue = dataIn.readLong(); + this.ints = PackedInts.getReaderIterator(dataIn); + maxDoc = ints.size(); + } + + @Override + public void close() throws IOException { + ints.close(); + dataIn.close(); + } + + @Override + public int advance(int target) throws IOException { + if (target >= maxDoc) { + return pos = NO_MORE_DOCS; + } + final long val = ints.advance(target); + intsRef.ints[intsRef.offset] = val == defaultValue ? 0 : minValue + val; + return pos = target; + } + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws IOException { + if (pos >= maxDoc) { + return pos = NO_MORE_DOCS; + } + return advance(pos + 1); + } + } + + private static final class FixedIntsEnumImpl extends ValuesEnum { + private final IndexInput dataIn; + private final int maxDoc; + private int pos = -1; + + private FixedIntsEnumImpl(AttributeSource source, IndexInput dataIn) + throws IOException { + super(source, ValueType.INTS); + intsRef.offset = 0; + this.dataIn = dataIn; + dataIn.seek(CodecUtil.headerLength(CODEC_NAME) + 1); + maxDoc = dataIn.readInt(); + } + + @Override + public void close() throws IOException { + dataIn.close(); + } + + @Override + public int advance(int target) throws IOException { + if (target >= maxDoc) { + return pos = NO_MORE_DOCS; + } + assert target > pos; + if (target > pos+1) { + dataIn.seek(dataIn.getFilePointer() + ((target - pos - 1) * 8)); + } + intsRef.ints[intsRef.offset] = dataIn.readLong(); + return pos = target; + } + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws IOException { + if (pos >= maxDoc) { + return pos = NO_MORE_DOCS; + } + return advance(pos + 1); + } + } + +} \ No newline at end of file diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/values/MultiIndexDocValues.java docvalues/lucene/src/java/org/apache/lucene/index/values/MultiIndexDocValues.java --- trunk_2/lucene/src/java/org/apache/lucene/index/values/MultiIndexDocValues.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/values/MultiIndexDocValues.java 2011-06-03 22:42:57.000000000 +0200 @@ -0,0 +1,278 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.ReaderUtil; + +/** + * A wrapper for compound IndexReader providing access to per segment + * {@link IndexDocValues} + * + * @lucene.experimental + */ +public class MultiIndexDocValues extends IndexDocValues { + + public static class DocValuesIndex { + public final static DocValuesIndex[] EMPTY_ARRAY = new DocValuesIndex[0]; + final int start; + final int length; + final IndexDocValues docValues; + + public DocValuesIndex(IndexDocValues docValues, int start, int length) { + this.docValues = docValues; + this.start = start; + this.length = length; + } + } + + private DocValuesIndex[] docValuesIdx; + private int[] starts; + + public MultiIndexDocValues() { + starts = new int[0]; + docValuesIdx = new DocValuesIndex[0]; + } + + public MultiIndexDocValues(DocValuesIndex[] docValuesIdx) { + reset(docValuesIdx); + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + return new MultiValuesEnum(docValuesIdx, starts); + } + + @Override + public Source load() throws IOException { + return new MultiSource(docValuesIdx, starts); + } + + public void close() throws IOException { + super.close(); + } + + public IndexDocValues reset(DocValuesIndex[] docValuesIdx) { + int[] start = new int[docValuesIdx.length]; + for (int i = 0; i < docValuesIdx.length; i++) { + start[i] = docValuesIdx[i].start; + } + this.starts = start; + this.docValuesIdx = docValuesIdx; + return this; + } + + public static class DummyDocValues extends IndexDocValues { + final int maxDoc; + final Source emptySoruce; + + public DummyDocValues(int maxDoc, ValueType type) { + this.maxDoc = maxDoc; + this.emptySoruce = new EmptySource(type); + } + + @Override + public ValuesEnum getEnum(AttributeSource attrSource) throws IOException { + return emptySoruce.getEnum(attrSource); + } + + @Override + public Source load() throws IOException { + return emptySoruce; + } + + @Override + public ValueType type() { + return emptySoruce.type(); + } + + public void close() throws IOException { + super.close(); + } + + } + + private static class MultiValuesEnum extends ValuesEnum { + private DocValuesIndex[] docValuesIdx; + private final int maxDoc; + private int currentStart; + private int currentMax; + private int currentDoc = -1; + private ValuesEnum currentEnum; + private final int[] starts; + + public MultiValuesEnum(DocValuesIndex[] docValuesIdx, int[] starts) + throws IOException { + super(docValuesIdx[0].docValues.type()); + this.docValuesIdx = docValuesIdx; + final DocValuesIndex last = docValuesIdx[docValuesIdx.length - 1]; + maxDoc = last.start + last.length; + final DocValuesIndex idx = docValuesIdx[0]; + currentEnum = idx.docValues.getEnum(this.attributes()); + currentEnum.copyFrom(this); + intsRef = currentEnum.intsRef; + currentMax = idx.length; + currentStart = 0; + this.starts = starts; + } + + @Override + public void close() throws IOException { + currentEnum.close(); + } + + @Override + public int advance(int 
target) throws IOException { + assert target > currentDoc : "target " + target + + " must be > than the current doc " + currentDoc; + int relativeDoc = target - currentStart; + do { + if (target >= maxDoc) {// we are beyond max doc + return currentDoc = NO_MORE_DOCS; + } + if (target >= currentMax) { + final int idx = ReaderUtil.subIndex(target, starts); + currentEnum.close(); + currentEnum = docValuesIdx[idx].docValues.getEnum(); + currentEnum.copyFrom(this); + currentStart = docValuesIdx[idx].start; + currentMax = currentStart + docValuesIdx[idx].length; + relativeDoc = target - currentStart; + } + target = currentMax; // make sure that we advance to the next enum if the current is exhausted + + } while ((relativeDoc = currentEnum.advance(relativeDoc)) == NO_MORE_DOCS); + return currentDoc = currentStart + relativeDoc; + } + + @Override + public int docID() { + return currentDoc; + } + + @Override + public int nextDoc() throws IOException { + return advance(currentDoc + 1); + } + } + + private static class MultiSource extends Source { + private int numDocs = 0; + private int start = 0; + private Source current; + private final int[] starts; + private final DocValuesIndex[] docValuesIdx; + + public MultiSource(DocValuesIndex[] docValuesIdx, int[] starts) { + this.docValuesIdx = docValuesIdx; + this.starts = starts; + assert docValuesIdx.length != 0; + + } + + public long getInt(int docID) { + final int doc = ensureSource(docID); + return current.getInt(doc); + } + + private final int ensureSource(int docID) { + if (docID >= start && docID < start+numDocs) { + return docID - start; + } else { + final int idx = ReaderUtil.subIndex(docID, starts); + assert idx >= 0 && idx < docValuesIdx.length : "idx was " + idx + + " for doc id: " + docID + " slices : " + Arrays.toString(starts); + assert docValuesIdx[idx] != null; + try { + current = docValuesIdx[idx].docValues.getSource(); + } catch (IOException e) { + throw new RuntimeException("load failed", e); // TODO how should we + // handle this + } + + start = docValuesIdx[idx].start; + numDocs = docValuesIdx[idx].length; + return docID - start; + } + } + + public double getFloat(int docID) { + final int doc = ensureSource(docID); + return current.getFloat(doc); + } + + public BytesRef getBytes(int docID, BytesRef bytesRef) { + final int doc = ensureSource(docID); + return current.getBytes(doc, bytesRef); + } + + @Override + public ValuesEnum getEnum(AttributeSource attrSource) throws IOException { + throw new UnsupportedOperationException(); // TODO + } + + @Override + public ValueType type() { + return docValuesIdx[0].docValues.type(); + } + + } + + private static class EmptySource extends Source { + private final ValueType type; + + public EmptySource(ValueType type) { + this.type = type; + } + + @Override + public BytesRef getBytes(int docID, BytesRef ref) { + ref.length = 0; + return ref; + + } + + @Override + public double getFloat(int docID) { + return 0d; + } + + @Override + public long getInt(int docID) { + return 0; + } + + @Override + public ValuesEnum getEnum(AttributeSource attrSource) throws IOException { + return ValuesEnum.emptyEnum(type); + } + + @Override + public ValueType type() { + return type; + } + } + + @Override + public ValueType type() { + return this.docValuesIdx[0].docValues.type(); + } +} diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/values/PerDocFieldValues.java docvalues/lucene/src/java/org/apache/lucene/index/values/PerDocFieldValues.java --- 
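Referring back to MultiIndexDocValues above: a hedged sketch of stitching a composite view over two segment-level IndexDocValues instances. The per-segment instances (segAValues, segBValues) and the doc counts are assumed to come from elsewhere, e.g. the individual segment readers.

// segA covers global docs [0, 10), segB covers [10, 25) -- invented sizes.
MultiIndexDocValues.DocValuesIndex[] slices = new MultiIndexDocValues.DocValuesIndex[] {
    new MultiIndexDocValues.DocValuesIndex(segAValues, 0, 10),
    new MultiIndexDocValues.DocValuesIndex(segBValues, 10, 15)
};
MultiIndexDocValues multi = new MultiIndexDocValues(slices);
// MultiSource maps a global doc id onto the right slice internally (ReaderUtil.subIndex)
long v = multi.getSource().getInt(12); // delegates to segBValues, local doc 2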
trunk_2/lucene/src/java/org/apache/lucene/index/values/PerDocFieldValues.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/values/PerDocFieldValues.java 2011-06-03 22:42:57.000000000 +0200 @@ -0,0 +1,101 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.util.Comparator; + +import org.apache.lucene.document.IndexDocValuesField; +import org.apache.lucene.document.Fieldable; +import org.apache.lucene.index.codecs.DocValuesConsumer; +import org.apache.lucene.util.BytesRef; + +/** + * Per document and field values consumed by {@link DocValuesConsumer}. + * @see IndexDocValuesField + * @see Fieldable#setDocValues(PerDocFieldValues) + * + * @lucene.experimental + */ +public interface PerDocFieldValues { + + /** + * Sets the given long value. + */ + public void setInt(long value); + + /** + * Sets the given float value. + */ + public void setFloat(float value); + + /** + * Sets the given double value. + */ + public void setFloat(double value); + + /** + * Sets the given {@link BytesRef} value and the field's {@link ValueType}. The + * comparator for this field is set to null. If a + * null comparator is set the default comparator for the given + * {@link ValueType} is used. + */ + public void setBytes(BytesRef value, ValueType type); + + /** + * Sets the given {@link BytesRef} value, the field's {@link ValueType} and the + * field's comparator. If the {@link Comparator} is set to null + * the default for the given {@link ValueType} is used instead. + */ + public void setBytes(BytesRef value, ValueType type, Comparator comp); + + /** + * Returns the set {@link BytesRef} or null if not set. + */ + public BytesRef getBytes(); + + /** + * Returns the set {@link BytesRef} comparator or null if not set + */ + public Comparator bytesComparator(); + + /** + * Returns the set floating point value or 0.0d if not set. + */ + public double getFloat(); + + /** + * Returns the set long value of 0 if not set. + */ + public long getInt(); + + /** + * Sets the {@link BytesRef} comparator for this field. If the field has a + * numeric {@link ValueType} the comparator will be ignored. 
+ */ + public void setBytesComparator(Comparator comp); + + /** + * Sets the {@link ValueType} + */ + public void setType(ValueType type); + + /** + * Returns the {@link ValueType} + */ + public ValueType type(); + +} \ No newline at end of file diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/values/SourceCache.java docvalues/lucene/src/java/org/apache/lucene/index/values/SourceCache.java --- trunk_2/lucene/src/java/org/apache/lucene/index/values/SourceCache.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/values/SourceCache.java 2011-06-03 22:42:57.000000000 +0200 @@ -0,0 +1,120 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Comparator; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.values.IndexDocValues.SortedSource; +import org.apache.lucene.index.values.IndexDocValues.Source; +import org.apache.lucene.util.BytesRef; + +/** + * Abstract base class for {@link IndexDocValues} {@link Source} / + * {@link SortedSource} cache. + *

+ * {@link Source} and {@link SortedSource} instances loaded via + * {@link IndexDocValues#load()} and {@link IndexDocValues#loadSorted(Comparator)} are + * entirely memory resident and need to be maintained by the caller. Each call + * to {@link IndexDocValues#load()} or {@link IndexDocValues#loadSorted(Comparator)} will + * cause an entire reload of the underlying data. Source and + * {@link SortedSource} instances obtained from {@link IndexDocValues#getSource()} + * and {@link IndexDocValues#getSource()} respectively are maintained by a + * {@link SourceCache} that is closed ({@link #close(IndexDocValues)}) once the + * {@link IndexReader} that created the {@link IndexDocValues} instance is closed. + *

+ * Unless {@link Source} and {@link SortedSource} instances are managed by + * another entity it is recommended to use the cached variants to obtain a + * source instance. + *
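For the contract spelled out above (atomic, thread-safe, one cache per IndexDocValues instance), a minimal custom cache might look like the sketch below. It mirrors the semantics of the default DirectSourceCache shown further down while additionally counting loads, and it would be installed via IndexDocValues#setCache before the first Source is requested. The generic parameter is written as Comparator<BytesRef>, which the surrounding signatures strongly suggest but which the rendered diff does not show; the sketch also assumes the same imports as SourceCache.java itself (IndexDocValues.Source / IndexDocValues.SortedSource).

// Sketch only: same semantics as DirectSourceCache, plus a load counter.
public static final class CountingSourceCache extends SourceCache {
  private Source ref;
  private SortedSource sortedRef;
  private int loads;

  @Override
  public synchronized Source load(IndexDocValues values) throws IOException {
    if (ref == null) {
      ref = values.load();
      loads++;
    }
    return ref;
  }

  @Override
  public synchronized SortedSource loadSorted(IndexDocValues values,
      Comparator<BytesRef> comp) throws IOException {
    if (sortedRef == null) {
      sortedRef = values.loadSorted(comp);
      loads++;
    }
    return sortedRef;
  }

  @Override
  public synchronized void invalidate(IndexDocValues values) {
    ref = null;
    sortedRef = null;
  }

  public synchronized int loads() {
    return loads;
  }
}
// installed before any Source is requested:
// docValues.setCache(new CountingSourceCache());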

+ * Implementation of this API must be thread-safe. + * + * @see IndexDocValues#setCache(SourceCache) + * @see IndexDocValues#getSource() + * @see IndexDocValues#getSortedSorted(Comparator) + * + * @lucene.experimental + */ +public abstract class SourceCache { + + /** + * Atomically loads a {@link Source} into the cache from the given + * {@link IndexDocValues} and returns it iff no other {@link Source} has already + * been cached. Otherwise the cached source is returned. + *

+ * This method will not return null + */ + public abstract Source load(IndexDocValues values) throws IOException; + + /** + * Atomically loads a {@link SortedSource} into the cache from the given + * {@link IndexDocValues} and returns it iff no other {@link SortedSource} has + * already been cached. Otherwise the cached source is returned. + *

+ * This method will not return null + */ + public abstract SortedSource loadSorted(IndexDocValues values, + Comparator comp) throws IOException; + + /** + * Atomically invalidates the cached {@link Source} and {@link SortedSource} + * instances if any and empties the cache. + */ + public abstract void invalidate(IndexDocValues values); + + /** + * Atomically closes the cache and frees all resources. + */ + public synchronized void close(IndexDocValues values) { + invalidate(values); + } + + /** + * Simple per {@link IndexDocValues} instance cache implementation that holds a + * {@link Source} and {@link SortedSource} reference as a member variable. + *

+ * If a {@link DirectSourceCache} instance is closed or invalidated the cached + * reference are simply set to null + */ + public static final class DirectSourceCache extends SourceCache { + private Source ref; + private SortedSource sortedRef; + + public synchronized Source load(IndexDocValues values) throws IOException { + if (ref == null) { + ref = values.load(); + } + return ref; + } + + public synchronized SortedSource loadSorted(IndexDocValues values, + Comparator comp) throws IOException { + if (sortedRef == null) { + sortedRef = values.loadSorted(comp); + } + return sortedRef; + } + + public synchronized void invalidate(IndexDocValues values) { + ref = null; + sortedRef = null; + } + } + +} diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/values/ValueType.java docvalues/lucene/src/java/org/apache/lucene/index/values/ValueType.java --- trunk_2/lucene/src/java/org/apache/lucene/index/values/ValueType.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/values/ValueType.java 2011-06-03 22:42:55.000000000 +0200 @@ -0,0 +1,181 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.PerDocConsumer; +import org.apache.lucene.index.values.IndexDocValues.SortedSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.packed.PackedInts; + +/** + * {@link ValueType} specifies the type of the {@link IndexDocValues} for a + * certain field. A {@link ValueType} only defines the data type for a field + * while the actual Implementation used to encode and decode the values depends + * on the field's {@link Codec}. It is up to the {@link Codec} implementing + * {@link PerDocConsumer#addValuesField(org.apache.lucene.index.FieldInfo)} and + * using a different low-level implementations to write the stored values for a + * field. + * + * @lucene.experimental + */ +public enum ValueType { + /* + * TODO: Add INT_32 INT_64 INT_16 & INT_8?! + */ + /** + * Defines an 64 bit integer value. By default this type uses a simple + * compression technique based on {@link PackedInts}. Internally only the used + * value range is encoded if it fits into 263-1. If that range is + * exceeded the default implementation falls back to fixed size 64bit + * integers. + *

+ * NOTE: this type uses 0 as the default value without any + * distinction between provided 0 values during indexing. All + * documents without an explicit value will use 0 instead. In turn, + * {@link ValuesEnum} instances will not skip documents without an explicit + * value assigned. Custom default values must be assigned explicitly. + *

+ */ + INTS, + + /** + * Defines a 32 bit floating point values. By default there is no compression + * applied. To fit custom float values into less than 32bit either a custom + * implementation is needed or values must be encoded into a + * {@link #BYTES_FIXED_STRAIGHT} type. + *

+ * NOTE: this type uses 0.0f as the default value without any + * distinction between provided 0.0f values during indexing. All + * documents without an explicit value will use 0.0f instead. In + * turn, {@link ValuesEnum} instances will not skip documents without an + * explicit value assigned. Custom default values must be assigned explicitly. + *

+ */ + FLOAT_32, + /** + * Defines a 64 bit floating point values. By default there is no compression + * applied. To fit custom float values into less than 64bit either a custom + * implementation is needed or values must be encoded into a + * {@link #BYTES_FIXED_STRAIGHT} type. + *

+ * NOTE: this type uses 0.0d as the default value without any + * distinction between provided 0.0d values during indexing. All + * documents without an explicit value will use 0.0d instead. In + * turn, {@link ValuesEnum} instances will not skip documents without an + * explicit value assigned. Custom default values must be assigned explicitly. + *

+ */ + FLOAT_64, + + // TODO(simonw): -- shouldn't lucene decide/detect straight vs + // deref, as well fixed vs var? + /** + * Defines a fixed length straight stored byte variant. All values added to + * such a field must be of the same length. All bytes are stored sequentially + * for fast offset access. + *

+ * NOTE: this type uses 0-bytes based on the length of the first seen + * values as the default value without any distinction between explicitly + * provided values during indexing. All documents without an explicit value + * will use the default instead. In turn, {@link ValuesEnum} instances will + * not skip documents without an explicit value assigned. Custom default + * values must be assigned explicitly. + *

+ */ + BYTES_FIXED_STRAIGHT, + + /** + * Defines a fixed length dereferenced (indexed) byte variant. Fields with + * this type only store distinct byte values and store an additional offset + * pointer per document to dereference the payload. + *

+ * NOTE: Fields of this type will not store values for documents without and + * explicitly provided value. If a documents value is accessed while no + * explicit value is stored the returned {@link BytesRef} will be a 0-length + * reference. In turn, {@link ValuesEnum} instances will skip over documents + * without an explicit value assigned. Custom default values must be assigned + * explicitly. + *

+ */ + BYTES_FIXED_DEREF, + + /** + * Defines a fixed length pre-sorted byte variant. Fields with this type only + * store distinct byte values and store an additional offset pointer per + * document to dereference the payload. The stored byte payload is presorted + * and allows access via document id, ordinal and by-value. + *

+ * NOTE: Fields of this type will not store values for documents without and + * explicitly provided value. If a documents value is accessed while no + * explicit value is stored the returned {@link BytesRef} will be a 0-length + * reference. In turn, {@link ValuesEnum} instances will skip over documents + * without an explicit value assigned. Custom default values must be assigned + * explicitly. + *

+ * + * @see SortedSource + */ + BYTES_FIXED_SORTED, + + /** + * Defines a variable length straight stored byte variant. All bytes are + * stored sequentially for compactness. Usage of this type via the + * disk-resident API might yield performance degradation since no additional + * index is used to advance by more than one document's value at a time. + *

+ * NOTE: Fields of this type will not store values for documents without an + * explicitly provided value. If a document's value is accessed while no + * explicit value is stored, the returned {@link BytesRef} will be a 0-length + * reference. Yet, in contrast to the dereferenced variants, {@link ValuesEnum} + * instances will not skip over documents without an explicit value + * assigned. Custom default values must be assigned explicitly. + *

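[Editorial aside, not part of the patch: besides enum iteration, the straight variants are typically read randomly by document id through a Source, where a 0-length BytesRef marks a document without an explicit value. getSource() is used the same way by FieldComparator later in this diff; getBytes(int, BytesRef) follows the Source implementations added further down.]

    // Sketch only: random access to a BYTES_VAR_STRAIGHT / BYTES_FIXED_STRAIGHT field.
    static org.apache.lucene.util.BytesRef bytesFor(
        org.apache.lucene.index.values.IndexDocValues docValues, int docID) throws java.io.IOException {
      final org.apache.lucene.index.values.IndexDocValues.Source source = docValues.getSource();
      final org.apache.lucene.util.BytesRef ref =
          source.getBytes(docID, new org.apache.lucene.util.BytesRef());
      return ref; // ref.length == 0 if the document has no explicit value
    }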
+ */ + BYTES_VAR_STRAIGHT, + + /** + * Defines a variable length dereferenced (indexed) byte variant. Just as + * {@link #BYTES_FIXED_DEREF} yet supporting variable length values. + *

+ * NOTE: Fields of this type will not store values for documents without an + * explicitly provided value. If a document's value is accessed while no + * explicit value is stored, the returned {@link BytesRef} will be a 0-length + * reference. In turn, {@link ValuesEnum} instances will skip over documents + * without an explicit value assigned. Custom default values must be assigned + * explicitly. + *

+ */ + BYTES_VAR_DEREF, + + /** + * Defines a variable length pre-sorted byte variant. Just as + * {@link #BYTES_FIXED_SORTED} yet supporting variable length values. + *

+ * NOTE: Fields of this type will not store values for documents without an + * explicitly provided value. If a document's value is accessed while no + * explicit value is stored, the returned {@link BytesRef} will be a 0-length + * reference. In turn, {@link ValuesEnum} instances will skip over documents + * without an explicit value assigned. Custom default values must be assigned + * explicitly. + *

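[Editorial aside, not part of the patch: the sorted variants additionally expose per-document ordinals and by-value lookup through SortedSource (assumed, like Source, to be nested in IndexDocValues); the methods used below are those implemented by VarSortedBytesImpl further down in this diff.]

    // Sketch only: ordinal and by-value lookup on a sorted bytes field.
    static void sortedLookup(org.apache.lucene.index.values.IndexDocValues.SortedSource sorted,
        int docID, org.apache.lucene.util.BytesRef key) {
      final int docOrd = sorted.ord(docID); // ordinal of the document's value in sorted order
      final int keyOrd = sorted.getByValue(key, new org.apache.lucene.util.BytesRef());
      System.out.println(docOrd + " / " + keyOrd + " of " + sorted.getValueCount() + " distinct values");
    }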
+ * + * @see SortedSource + */ + BYTES_VAR_SORTED +} diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/values/ValuesEnum.java docvalues/lucene/src/java/org/apache/lucene/index/values/ValuesEnum.java --- trunk_2/lucene/src/java/org/apache/lucene/index/values/ValuesEnum.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/values/ValuesEnum.java 2011-06-03 22:42:57.000000000 +0200 @@ -0,0 +1,173 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FloatsRef; +import org.apache.lucene.util.LongsRef; + +/** + * {@link ValuesEnum} is a {@link DocIdSetIterator} iterating byte[] + * , long and double stored per document. Depending on the + * enum's {@link ValueType} ({@link #type()}) the enum might skip over documents that + * have no value stored. Types like {@link ValueType#BYTES_VAR_STRAIGHT} might not + * skip over documents even if there is no value associated with a document. The + * value for document without values again depends on the types implementation + * although a reference for a {@link ValueType} returned from a accessor method + * {@link #getFloat()}, {@link #getInt()} or {@link #bytes()} will never be + * null even if a document has no value. + *

+ * Note: Only the reference for the enum's type are initialized to non + * null ie. {@link #getInt()} will always return null + * if the enum's Type is {@link ValueType#FLOAT_32}. + * + * @lucene.experimental + */ +public abstract class ValuesEnum extends DocIdSetIterator { + private AttributeSource source; + private final ValueType enumType; + protected BytesRef bytesRef; + protected FloatsRef floatsRef; + protected LongsRef intsRef; + + /** + * Creates a new {@link ValuesEnum} for the given type. The + * {@link AttributeSource} for this enum is set to null + */ + protected ValuesEnum(ValueType enumType) { + this(null, enumType); + } + + /** + * Creates a new {@link ValuesEnum} for the given type. + */ + protected ValuesEnum(AttributeSource source, ValueType enumType) { + this.source = source; + this.enumType = enumType; + switch (enumType) { + case BYTES_FIXED_DEREF: + case BYTES_FIXED_SORTED: + case BYTES_FIXED_STRAIGHT: + case BYTES_VAR_DEREF: + case BYTES_VAR_SORTED: + case BYTES_VAR_STRAIGHT: + bytesRef = new BytesRef(); + break; + case INTS: + intsRef = new LongsRef(1); + break; + case FLOAT_32: + case FLOAT_64: + floatsRef = new FloatsRef(1); + break; + } + } + + /** + * Returns the type of this enum + */ + public ValueType type() { + return enumType; + } + + /** + * Returns a {@link BytesRef} or null if this enum doesn't + * enumerate byte[] values + */ + public BytesRef bytes() { + return bytesRef; + } + + /** + * Returns a {@link FloatsRef} or null if this enum doesn't + * enumerate floating point values + */ + public FloatsRef getFloat() { + return floatsRef; + } + + /** + * Returns a {@link LongsRef} or null if this enum doesn't + * enumerate integer values. + */ + public LongsRef getInt() { + return intsRef; + } + + /** + * Copies the internal state from the given enum + */ + protected void copyFrom(ValuesEnum valuesEnum) { + intsRef = valuesEnum.intsRef; + floatsRef = valuesEnum.floatsRef; + bytesRef = valuesEnum.bytesRef; + source = valuesEnum.source; + } + + /** + * Returns the {@link AttributeSource} associated with this enum. + *

+ * Note: this method might create a new AttribueSource if no + * {@link AttributeSource} has been provided during enum creation. + */ + public AttributeSource attributes() { + if (source == null) { + source = new AttributeSource(); + } + return source; + } + + /** + * Closes the enum + * + * @throws IOException + * if an {@link IOException} occurs + */ + public abstract void close() throws IOException; + + /** + * Returns an empty {@link ValuesEnum} for the given {@link ValueType}. + */ + public static ValuesEnum emptyEnum(ValueType type) { + return new ValuesEnum(type) { + @Override + public int nextDoc() throws IOException { + return NO_MORE_DOCS; + } + + @Override + public int docID() { + return NO_MORE_DOCS; + } + + @Override + public int advance(int target) throws IOException { + return NO_MORE_DOCS; + } + + @Override + public void close() throws IOException { + + } + }; + } + +} diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/values/VarDerefBytesImpl.java docvalues/lucene/src/java/org/apache/lucene/index/values/VarDerefBytesImpl.java --- trunk_2/lucene/src/java/org/apache/lucene/index/values/VarDerefBytesImpl.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/values/VarDerefBytesImpl.java 2011-06-03 22:42:57.000000000 +0200 @@ -0,0 +1,287 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.lucene.index.values.Bytes.BytesBaseSource; +import org.apache.lucene.index.values.Bytes.BytesReaderBase; +import org.apache.lucene.index.values.Bytes.BytesWriterBase; +import org.apache.lucene.index.values.FixedDerefBytesImpl.Reader.DerefBytesEnum; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.ByteBlockPool; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.PagedBytes; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.ByteBlockPool.Allocator; +import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator; +import org.apache.lucene.util.BytesRefHash.TrackingDirectBytesStartArray; +import org.apache.lucene.util.packed.PackedInts; + +// Stores variable-length byte[] by deref, ie when two docs +// have the same value, they store only 1 byte[] and both +// docs reference that single source + +/** + * @lucene.experimental + */ +class VarDerefBytesImpl { + + static final String CODEC_NAME = "VarDerefBytes"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + private static final class AddressByteStartArray extends + TrackingDirectBytesStartArray { + int[] address; + + AddressByteStartArray(int size, AtomicLong bytesUsed) { + super(size, bytesUsed); + } + + @Override + public AtomicLong bytesUsed() { + return bytesUsed; + } + + @Override + public int[] clear() { + if (address != null) { + bytesUsed.addAndGet(-address.length * RamUsageEstimator.NUM_BYTES_INT); + address = null; + } + return super.clear(); + } + + @Override + public int[] grow() { + assert address != null; + final int oldSize = address.length; + final int[] retVal = super.grow(); + address = ArrayUtil.grow(address, retVal.length); + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT + * (address.length - oldSize)); + return retVal; + } + + @Override + public int[] init() { + if (address == null) { + address = new int[ArrayUtil.oversize(initSize, + RamUsageEstimator.NUM_BYTES_INT)]; + bytesUsed.addAndGet((address.length) * RamUsageEstimator.NUM_BYTES_INT); + } + return super.init(); + } + + } + + /* + * TODO: if impls like this are merged we are bound to the amount of memory we + * can store into a BytesRefHash and therefore how much memory a ByteBlockPool + * can address. This is currently limited to 2GB. While we could extend that + * and use 64bit for addressing this still limits us to the existing main + * memory as all distinct bytes will be loaded up into main memory. We could + * move the byte[] writing to #finish(int) and store the bytes in sorted + * order and merge them in a streamed fashion. 
+ */ + static class Writer extends BytesWriterBase { + private int[] docToAddress; + private int address = 1; + + private final AddressByteStartArray array = new AddressByteStartArray(1, + bytesUsed); + private final BytesRefHash hash = new BytesRefHash(pool, 16, array); + + public Writer(Directory dir, String id, AtomicLong bytesUsed) + throws IOException { + this(dir, id, new DirectTrackingAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, bytesUsed), + bytesUsed); + } + + public Writer(Directory dir, String id, Allocator allocator, + AtomicLong bytesUsed) throws IOException { + super(dir, id, CODEC_NAME, VERSION_CURRENT, true, + new ByteBlockPool(allocator), bytesUsed); + docToAddress = new int[1]; + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT); + } + + @Override + public void add(int docID, BytesRef bytes) throws IOException { + if (bytes.length == 0) + return; // default + final int e = hash.add(bytes); + + if (docID >= docToAddress.length) { + final int oldSize = docToAddress.length; + docToAddress = ArrayUtil.grow(docToAddress, 1 + docID); + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT + * (docToAddress.length - oldSize)); + } + final int docAddress; + if (e >= 0) { + docAddress = array.address[e] = address; + address += writePrefixLength(datOut, bytes); + datOut.writeBytes(bytes.bytes, bytes.offset, bytes.length); + address += bytes.length; + } else { + docAddress = array.address[(-e) - 1]; + } + docToAddress[docID] = docAddress; + } + + private static int writePrefixLength(DataOutput datOut, BytesRef bytes) + throws IOException { + if (bytes.length < 128) { + datOut.writeByte((byte) bytes.length); + return 1; + } else { + datOut.writeByte((byte) (0x80 | (bytes.length >> 8))); + datOut.writeByte((byte) (bytes.length & 0xff)); + return 2; + } + } + + // Important that we get docCount, in case there were + // some last docs that we didn't see + @Override + public void finish(int docCount) throws IOException { + try { + idxOut.writeInt(address - 1); + // write index + // TODO(simonw): -- allow forcing fixed array (not -1) + // TODO(simonw): check the address calculation / make it more intuitive + final PackedInts.Writer w = PackedInts.getWriter(idxOut, docCount, + PackedInts.bitsRequired(address - 1)); + final int limit; + if (docCount > docToAddress.length) { + limit = docToAddress.length; + } else { + limit = docCount; + } + for (int i = 0; i < limit; i++) { + w.add(docToAddress[i]); + } + for (int i = limit; i < docCount; i++) { + w.add(0); + } + w.finish(); + } finally { + hash.close(); + super.finish(docCount); + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT + * (-docToAddress.length)); + docToAddress = null; + } + } + } + + public static class Reader extends BytesReaderBase { + + Reader(Directory dir, String id, int maxDoc) throws IOException { + super(dir, id, CODEC_NAME, VERSION_START, true); + } + + @Override + public Source load() throws IOException { + final IndexInput data = cloneData(); + final IndexInput index = cloneIndex(); + data.seek(CodecUtil.headerLength(CODEC_NAME)); + index.seek(CodecUtil.headerLength(CODEC_NAME)); + final long totalBytes = index.readInt(); // should be long + return new Source(data, index, totalBytes); + } + + private static class Source extends BytesBaseSource { + private final PackedInts.Reader index; + + public Source(IndexInput datIn, IndexInput idxIn, long totalBytes) + throws IOException { + super(datIn, idxIn, new PagedBytes(PAGED_BYTES_BITS), totalBytes); + index = PackedInts.getReader(idxIn); + } + + @Override + public 
BytesRef getBytes(int docID, BytesRef bytesRef) { + long address = index.get(docID); + bytesRef.length = 0; + return address == 0 ? bytesRef : data.fillSliceWithPrefix(bytesRef, + --address); + } + + @Override + public int getValueCount() { + throw new UnsupportedOperationException(); + } + + @Override + public ValueType type() { + return ValueType.BYTES_VAR_DEREF; + } + + @Override + protected int maxDoc() { + return index.size(); + } + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + return new VarDerefBytesEnum(source, cloneData(), cloneIndex()); + } + + static class VarDerefBytesEnum extends DerefBytesEnum { + + public VarDerefBytesEnum(AttributeSource source, IndexInput datIn, + IndexInput idxIn) throws IOException { + super(source, datIn, idxIn, -1, ValueType.BYTES_VAR_DEREF); + } + + @Override + protected void fill(long address, BytesRef ref) throws IOException { + datIn.seek(fp + --address); + final byte sizeByte = datIn.readByte(); + final int size; + if ((sizeByte & 128) == 0) { + // length is 1 byte + size = sizeByte; + } else { + size = ((sizeByte & 0x7f) << 8) | ((datIn.readByte() & 0xff)); + } + if (ref.bytes.length < size) + ref.grow(size); + ref.length = size; + ref.offset = 0; + datIn.readBytes(ref.bytes, 0, size); + } + } + + @Override + public ValueType type() { + return ValueType.BYTES_VAR_DEREF; + } + } +} diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/values/VarSortedBytesImpl.java docvalues/lucene/src/java/org/apache/lucene/index/values/VarSortedBytesImpl.java --- trunk_2/lucene/src/java/org/apache/lucene/index/values/VarSortedBytesImpl.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/values/VarSortedBytesImpl.java 2011-06-03 22:42:57.000000000 +0200 @@ -0,0 +1,315 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Arrays; +import java.util.Comparator; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.lucene.index.values.Bytes.BytesBaseSortedSource; +import org.apache.lucene.index.values.Bytes.BytesReaderBase; +import org.apache.lucene.index.values.Bytes.BytesWriterBase; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.ByteBlockPool; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.PagedBytes; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.ByteBlockPool.Allocator; +import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator; +import org.apache.lucene.util.BytesRefHash.TrackingDirectBytesStartArray; +import org.apache.lucene.util.packed.PackedInts; + +// Stores variable-length byte[] by deref, ie when two docs +// have the same value, they store only 1 byte[] and both +// docs reference that single source + +/** + * @lucene.experimental + */ +class VarSortedBytesImpl { + + static final String CODEC_NAME = "VarDerefBytes"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + static class Writer extends BytesWriterBase { + private int[] docToEntry; + private final Comparator comp; + + private final BytesRefHash hash = new BytesRefHash(pool, + BytesRefHash.DEFAULT_CAPACITY, new TrackingDirectBytesStartArray( + BytesRefHash.DEFAULT_CAPACITY, bytesUsed)); + + public Writer(Directory dir, String id, Comparator comp, + AtomicLong bytesUsed) throws IOException { + this(dir, id, comp, new DirectTrackingAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, bytesUsed), + bytesUsed); + } + + public Writer(Directory dir, String id, Comparator comp, + Allocator allocator, AtomicLong bytesUsed) throws IOException { + super(dir, id, CODEC_NAME, VERSION_CURRENT, true, + new ByteBlockPool(allocator), bytesUsed); + this.comp = comp; + docToEntry = new int[1]; + docToEntry[0] = -1; + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT); + + } + + @Override + public void add(int docID, BytesRef bytes) throws IOException { + if (bytes.length == 0) + return;// default + if (docID >= docToEntry.length) { + int[] newArray = new int[ArrayUtil.oversize(1 + docID, + RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(docToEntry, 0, newArray, 0, docToEntry.length); + Arrays.fill(newArray, docToEntry.length, newArray.length, -1); + bytesUsed.addAndGet((newArray.length - docToEntry.length) + * RamUsageEstimator.NUM_BYTES_INT); + docToEntry = newArray; + } + final int e = hash.add(bytes); + docToEntry[docID] = e < 0 ? (-e) - 1 : e; + } + + // Important that we get docCount, in case there were + // some last docs that we didn't see + @Override + public void finish(int docCount) throws IOException { + final int count = hash.size(); + try { + final int[] sortedEntries = hash.sort(comp); + // first dump bytes data, recording index & offset as + // we go + long offset = 0; + long lastOffset = 0; + final int[] index = new int[count]; + final long[] offsets = new long[count]; + for (int i = 0; i < count; i++) { + final int e = sortedEntries[i]; + offsets[i] = offset; + index[e] = 1 + i; + + final BytesRef bytes = hash.get(e, new BytesRef()); + // TODO: we could prefix code... 
+ datOut.writeBytes(bytes.bytes, bytes.offset, bytes.length); + lastOffset = offset; + offset += bytes.length; + } + + // total bytes of data + idxOut.writeLong(offset); + + // write index -- first doc -> 1+ord + // TODO(simonw): allow not -1: + final PackedInts.Writer indexWriter = PackedInts.getWriter(idxOut, + docCount, PackedInts.bitsRequired(count)); + final int limit = docCount > docToEntry.length ? docToEntry.length + : docCount; + for (int i = 0; i < limit; i++) { + final int e = docToEntry[i]; + indexWriter.add(e == -1 ? 0 : index[e]); + } + for (int i = limit; i < docCount; i++) { + indexWriter.add(0); + } + indexWriter.finish(); + + // next ord (0-based) -> offset + // TODO(simonw): -- allow not -1: + PackedInts.Writer offsetWriter = PackedInts.getWriter(idxOut, count, + PackedInts.bitsRequired(lastOffset)); + for (int i = 0; i < count; i++) { + offsetWriter.add(offsets[i]); + } + offsetWriter.finish(); + } finally { + super.finish(docCount); + bytesUsed.addAndGet((-docToEntry.length) + * RamUsageEstimator.NUM_BYTES_INT); + hash.close(); + } + } + } + + public static class Reader extends BytesReaderBase { + + Reader(Directory dir, String id, int maxDoc) throws IOException { + super(dir, id, CODEC_NAME, VERSION_START, true); + } + + @Override + public org.apache.lucene.index.values.IndexDocValues.Source load() + throws IOException { + return loadSorted(null); + } + + @Override + public SortedSource loadSorted(Comparator comp) + throws IOException { + IndexInput indexIn = cloneIndex(); + return new Source(cloneData(), indexIn, comp, indexIn.readLong()); + } + + private static class Source extends BytesBaseSortedSource { + private final PackedInts.Reader docToOrdIndex; + private final PackedInts.Reader ordToOffsetIndex; // 0-based + private final long totBytes; + private final int valueCount; + + public Source(IndexInput datIn, IndexInput idxIn, + Comparator comp, long dataLength) throws IOException { + super(datIn, idxIn, comp, new PagedBytes(PAGED_BYTES_BITS), dataLength); + totBytes = dataLength; + docToOrdIndex = PackedInts.getReader(idxIn); + ordToOffsetIndex = PackedInts.getReader(idxIn); + valueCount = ordToOffsetIndex.size(); + closeIndexInput(); + } + + @Override + public int ord(int docID) { + return (int) docToOrdIndex.get(docID) - 1; + } + + @Override + public int getByValue(BytesRef bytes, BytesRef tmpRef) { + return binarySearch(bytes, tmpRef, 0, valueCount - 1); + } + + @Override + public int getValueCount() { + return valueCount; + } + + // ord is 0-based + @Override + protected BytesRef deref(int ord, BytesRef bytesRef) { + final long nextOffset; + if (ord == valueCount - 1) { + nextOffset = totBytes; + } else { + nextOffset = ordToOffsetIndex.get(1 + ord); + } + final long offset = ordToOffsetIndex.get(ord); + data.fillSlice(bytesRef, offset, (int) (nextOffset - offset)); + return bytesRef; + } + + @Override + public ValueType type() { + return ValueType.BYTES_VAR_SORTED; + } + + @Override + protected int maxDoc() { + return docToOrdIndex.size(); + } + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + return new VarSortedBytesEnum(source, cloneData(), cloneIndex()); + } + + private static class VarSortedBytesEnum extends ValuesEnum { + private PackedInts.Reader docToOrdIndex; + private PackedInts.Reader ordToOffsetIndex; + private IndexInput idxIn; + private IndexInput datIn; + private int valueCount; + private long totBytes; + private int docCount; + private int pos = -1; + private final long fp; + + protected 
VarSortedBytesEnum(AttributeSource source, IndexInput datIn, + IndexInput idxIn) throws IOException { + super(source, ValueType.BYTES_VAR_SORTED); + totBytes = idxIn.readLong(); + // keep that in memory to prevent lots of disk seeks + docToOrdIndex = PackedInts.getReader(idxIn); + ordToOffsetIndex = PackedInts.getReader(idxIn); + valueCount = ordToOffsetIndex.size(); + docCount = docToOrdIndex.size(); + fp = datIn.getFilePointer(); + this.idxIn = idxIn; + this.datIn = datIn; + } + + @Override + public void close() throws IOException { + idxIn.close(); + datIn.close(); + } + + @Override + public int advance(int target) throws IOException { + if (target >= docCount) { + return pos = NO_MORE_DOCS; + } + int ord; + while ((ord = (int) docToOrdIndex.get(target)) == 0) { + if (++target >= docCount) { + return pos = NO_MORE_DOCS; + } + } + final long offset = ordToOffsetIndex.get(--ord); + final long nextOffset; + if (ord == valueCount - 1) { + nextOffset = totBytes; + } else { + nextOffset = ordToOffsetIndex.get(1 + ord); + } + final int length = (int) (nextOffset - offset); + datIn.seek(fp + offset); + if (bytesRef.bytes.length < length) + bytesRef.grow(length); + datIn.readBytes(bytesRef.bytes, 0, length); + bytesRef.length = length; + bytesRef.offset = 0; + return pos = target; + } + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws IOException { + if (pos >= docCount) { + return pos = NO_MORE_DOCS; + } + return advance(pos + 1); + } + } + + @Override + public ValueType type() { + return ValueType.BYTES_VAR_SORTED; + } + } +} diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/values/VarStraightBytesImpl.java docvalues/lucene/src/java/org/apache/lucene/index/values/VarStraightBytesImpl.java --- trunk_2/lucene/src/java/org/apache/lucene/index/values/VarStraightBytesImpl.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/values/VarStraightBytesImpl.java 2011-06-03 22:42:57.000000000 +0200 @@ -0,0 +1,233 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.lucene.index.values.Bytes.BytesBaseSource; +import org.apache.lucene.index.values.Bytes.BytesReaderBase; +import org.apache.lucene.index.values.Bytes.BytesWriterBase; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.PagedBytes; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.packed.PackedInts; + +// Variable length byte[] per document, no sharing + +/** + * @lucene.experimental + */ +class VarStraightBytesImpl { + + static final String CODEC_NAME = "VarStraightBytes"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + static class Writer extends BytesWriterBase { + private long address; + // start at -1 if the first added value is > 0 + private int lastDocID = -1; + private long[] docToAddress; + + public Writer(Directory dir, String id, AtomicLong bytesUsed) + throws IOException { + super(dir, id, CODEC_NAME, VERSION_CURRENT, true, null, bytesUsed); + docToAddress = new long[1]; + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT); + } + + public Writer(Directory dir, String id) throws IOException { + this(dir, id, new AtomicLong()); + } + + // Fills up to but not including this docID + private void fill(final int docID) { + if (docID >= docToAddress.length) { + int oldSize = docToAddress.length; + docToAddress = ArrayUtil.grow(docToAddress, 1 + docID); + bytesUsed.addAndGet((docToAddress.length - oldSize) + * RamUsageEstimator.NUM_BYTES_INT); + } + for (int i = lastDocID + 1; i < docID; i++) { + docToAddress[i] = address; + } + lastDocID = docID; + } + + @Override + public void add(int docID, BytesRef bytes) throws IOException { + if (bytes.length == 0) + return; // default + fill(docID); + docToAddress[docID] = address; + datOut.writeBytes(bytes.bytes, bytes.offset, bytes.length); + address += bytes.length; + } + + @Override + public void finish(int docCount) throws IOException { + try { + if (lastDocID == -1) { + idxOut.writeVLong(0); + final PackedInts.Writer w = PackedInts.getWriter(idxOut, docCount, + PackedInts.bitsRequired(0)); + for (int i = 0; i < docCount; i++) { + w.add(0); + } + w.finish(); + } else { + fill(docCount); + idxOut.writeVLong(address); + final PackedInts.Writer w = PackedInts.getWriter(idxOut, docCount, + PackedInts.bitsRequired(address)); + for (int i = 0; i < docCount; i++) { + w.add(docToAddress[i]); + } + w.finish(); + } + } finally { + bytesUsed.addAndGet(-(docToAddress.length) + * RamUsageEstimator.NUM_BYTES_INT); + docToAddress = null; + super.finish(docCount); + } + } + + public long ramBytesUsed() { + return bytesUsed.get(); + } + } + + public static class Reader extends BytesReaderBase { + private final int maxDoc; + + Reader(Directory dir, String id, int maxDoc) throws IOException { + super(dir, id, CODEC_NAME, VERSION_START, true); + this.maxDoc = maxDoc; + } + + @Override + public Source load() throws IOException { + return new Source(cloneData(), cloneIndex()); + } + + private class Source extends BytesBaseSource { + private final PackedInts.Reader addresses; + + public Source(IndexInput datIn, IndexInput idxIn) throws IOException { + super(datIn, idxIn, new PagedBytes(PAGED_BYTES_BITS), idxIn.readVLong()); + addresses = PackedInts.getReader(idxIn); + } + + @Override + 
public BytesRef getBytes(int docID, BytesRef bytesRef) { + final long address = addresses.get(docID); + final int length = docID == maxDoc - 1 ? (int) (totalLengthInBytes - address) + : (int) (addresses.get(1 + docID) - address); + return data.fillSlice(bytesRef, address, length); + } + + @Override + public int getValueCount() { + throw new UnsupportedOperationException(); + } + + @Override + public ValueType type() { + return ValueType.BYTES_VAR_STRAIGHT; + } + + @Override + protected int maxDoc() { + return addresses.size(); + } + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + return new VarStraightBytesEnum(source, cloneData(), cloneIndex()); + } + + private class VarStraightBytesEnum extends ValuesEnum { + private final PackedInts.Reader addresses; + private final IndexInput datIn; + private final IndexInput idxIn; + private final long fp; + private final long totBytes; + private int pos = -1; + + protected VarStraightBytesEnum(AttributeSource source, IndexInput datIn, + IndexInput idxIn) throws IOException { + super(source, ValueType.BYTES_VAR_STRAIGHT); + totBytes = idxIn.readVLong(); + fp = datIn.getFilePointer(); + addresses = PackedInts.getReader(idxIn); + this.datIn = datIn; + this.idxIn = idxIn; + } + + @Override + public void close() throws IOException { + datIn.close(); + idxIn.close(); + } + + @Override + public int advance(final int target) throws IOException { + if (target >= maxDoc) { + return pos = NO_MORE_DOCS; + } + final long addr = addresses.get(target); + if (addr == totBytes) { // empty values at the end + bytesRef.length = 0; + bytesRef.offset = 0; + return pos = target; + } + datIn.seek(fp + addr); + final int size = (int) (target == maxDoc - 1 ? totBytes - addr + : addresses.get(target + 1) - addr); + if (bytesRef.bytes.length < size) { + bytesRef.grow(size); + } + bytesRef.length = size; + datIn.readBytes(bytesRef.bytes, 0, size); + return pos = target; + } + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws IOException { + return advance(pos + 1); + } + } + + @Override + public ValueType type() { + return ValueType.BYTES_VAR_STRAIGHT; + } + } +} diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/index/values/Writer.java docvalues/lucene/src/java/org/apache/lucene/index/values/Writer.java --- trunk_2/lucene/src/java/org/apache/lucene/index/values/Writer.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/index/values/Writer.java 2011-06-03 22:42:57.000000000 +0200 @@ -0,0 +1,228 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import java.io.IOException; +import java.util.Comparator; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.lucene.index.codecs.DocValuesConsumer; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; + +/** + * Abstract API for per-document stored primitive values of type byte[] + * , long or double. The API accepts a single value for each + * document. The underlying storage mechanism, file formats, data-structures and + * representations depend on the actual implementation. + *

+ * Document IDs passed to this API must always be increasing unless stated + * otherwise. + *

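[Editorial aside, not part of the patch: a minimal usage sketch of the factory and add/finish contract described here; the Directory, id and per-document values are placeholders.]

    // Sketch only: write one long per document with the default INTS writer.
    static void writeInts(org.apache.lucene.store.Directory dir, String id, long[] values)
        throws java.io.IOException {
      final java.util.concurrent.atomic.AtomicLong bytesUsed = new java.util.concurrent.atomic.AtomicLong();
      // Passing null as comparator falls back to BytesRef's UTF-8 order (only used by BYTES_* types).
      final org.apache.lucene.index.values.Writer writer = org.apache.lucene.index.values.Writer.create(
          org.apache.lucene.index.values.ValueType.INTS, id, dir, null, bytesUsed);
      for (int docID = 0; docID < values.length; docID++) {
        writer.add(docID, values[docID]); // document ids must be added in increasing order
      }
      writer.finish(values.length); // docCount must cover the largest docID added
    }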
+ * + * @lucene.experimental + */ +public abstract class Writer extends DocValuesConsumer { + + /** + * Creates a new {@link Writer}. + * + * @param bytesUsed + * bytes-usage tracking reference used by implementation to track + * internally allocated memory. All tracked bytes must be released + * once {@link #finish(int)} has been called. + */ + protected Writer(AtomicLong bytesUsed) { + super(bytesUsed); + } + + /** + * Filename extension for index files + */ + public static final String INDEX_EXTENSION = "idx"; + + /** + * Filename extension for data files. + */ + public static final String DATA_EXTENSION = "dat"; + + /** + * Records the specified long value for the docID or throws an + * {@link UnsupportedOperationException} if this {@link Writer} doesn't record + * long values. + * + * @throws UnsupportedOperationException + * if this writer doesn't record long values + */ + public void add(int docID, long value) throws IOException { + throw new UnsupportedOperationException(); + } + + /** + * Records the specified double value for the docID or throws an + * {@link UnsupportedOperationException} if this {@link Writer} doesn't record + * double values. + * + * @throws UnsupportedOperationException + * if this writer doesn't record double values + */ + public void add(int docID, double value) throws IOException { + throw new UnsupportedOperationException(); + } + + /** + * Records the specified {@link BytesRef} value for the docID or throws an + * {@link UnsupportedOperationException} if this {@link Writer} doesn't record + * {@link BytesRef} values. + * + * @throws UnsupportedOperationException + * if this writer doesn't record {@link BytesRef} values + */ + public void add(int docID, BytesRef value) throws IOException { + throw new UnsupportedOperationException(); + } + + /** + * Records a value from the given document id. The methods implementation + * obtains the value for the document id from the last {@link ValuesEnum} + * set to {@link #setNextEnum(ValuesEnum)}. + *

+ * This method is used during merging to provide an implementation-agnostic + * default merge implementation. + *

+ *

+ * The given document id must be the same document id returned from + * {@link ValuesEnum#docID()} when this method is called. All document IDs + * between the given ID and the previously given ID or 0 if the + * method is called the first time are filled with default values depending on + * the {@link Writer} implementation. The given document ID must always be + * greater than the previous ID or 0 if called the first time. + */ + protected abstract void add(int docID) throws IOException; + + /** + * Sets the next {@link ValuesEnum} to consume values from on calls to + * {@link #add(int)} + * + * @param valuesEnum + * the next {@link ValuesEnum}, this must not be null + */ + protected abstract void setNextEnum(ValuesEnum valuesEnum); + + /** + * Finish writing and close any files and resources used by this Writer. + * + * @param docCount + * the total number of documents for this writer. This must be + * greater than or equal to the largest document id passed to one of + * the add methods after the {@link Writer} was created. + */ + public abstract void finish(int docCount) throws IOException; + + @Override + protected void merge(MergeState state) throws IOException { + // This enables bulk copies in subclasses per MergeState, subclasses can + // simply override this and decide if they want to merge + // segments using this generic implementation or if a bulk merge is possible + // / feasible. + final ValuesEnum valEnum = state.reader.getEnum(); + assert valEnum != null; + try { + setNextEnum(valEnum); // set the current enum we are working on - the + // impl. will get the correct reference for the type + // it supports + int docID = state.docBase; + final Bits bits = state.bits; + final int docCount = state.docCount; + int currentDocId; + if ((currentDocId = valEnum.advance(0)) != ValuesEnum.NO_MORE_DOCS) { + for (int i = 0; i < docCount; i++) { + if (bits == null || !bits.get(i)) { + if (currentDocId < i) { + if ((currentDocId = valEnum.advance(i)) == ValuesEnum.NO_MORE_DOCS) { + break; // advance can jump over default values + } + } + if (currentDocId == i) { // we are on the doc to merge + add(docID); + } + ++docID; + } + } + } + } finally { + valEnum.close(); + } + } + + /** + * Factory method to create a {@link Writer} instance for a given type. This + * method returns default implementations for each of the different types + * defined in the {@link ValueType} enumeration. + * + * @param type + * the {@link ValueType} to create the {@link Writer} for + * @param id + * the file name id used to create files within the writer. + * @param directory + * the {@link Directory} to create the files from. + * @param comp + * a {@link BytesRef} comparator used for {@link Bytes} variants. If + * null + * {@link BytesRef#getUTF8SortedAsUnicodeComparator()} is used as the + * default.
+ * @param bytesUsed + * a byte-usage tracking reference + * @return a new {@link Writer} instance for the given {@link ValueType} + * @throws IOException + */ + public static Writer create(ValueType type, String id, Directory directory, + Comparator comp, AtomicLong bytesUsed) throws IOException { + if (comp == null) { + comp = BytesRef.getUTF8SortedAsUnicodeComparator(); + } + switch (type) { + case INTS: + return Ints.getWriter(directory, id, true, bytesUsed); + case FLOAT_32: + return Floats.getWriter(directory, id, 4, bytesUsed); + case FLOAT_64: + return Floats.getWriter(directory, id, 8, bytesUsed); + case BYTES_FIXED_STRAIGHT: + return Bytes.getWriter(directory, id, Bytes.Mode.STRAIGHT, comp, true, + bytesUsed); + case BYTES_FIXED_DEREF: + return Bytes.getWriter(directory, id, Bytes.Mode.DEREF, comp, true, + bytesUsed); + case BYTES_FIXED_SORTED: + return Bytes.getWriter(directory, id, Bytes.Mode.SORTED, comp, true, + bytesUsed); + case BYTES_VAR_STRAIGHT: + return Bytes.getWriter(directory, id, Bytes.Mode.STRAIGHT, comp, false, + bytesUsed); + case BYTES_VAR_DEREF: + return Bytes.getWriter(directory, id, Bytes.Mode.DEREF, comp, false, + bytesUsed); + case BYTES_VAR_SORTED: + return Bytes.getWriter(directory, id, Bytes.Mode.SORTED, comp, false, + bytesUsed); + default: + throw new IllegalArgumentException("Unknown Values: " + type); + } + } +} diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/queryParser/MultiFieldQueryParser.java docvalues/lucene/src/java/org/apache/lucene/queryParser/MultiFieldQueryParser.java --- trunk_2/lucene/src/java/org/apache/lucene/queryParser/MultiFieldQueryParser.java 2010-10-23 09:44:04.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/queryParser/MultiFieldQueryParser.java 2010-10-26 11:39:44.000000000 +0200 @@ -32,7 +32,7 @@ /** * A QueryParser which constructs queries to search multiple fields. 
* - * @version $Revision: 1026489 $ + * @version $Revision: 1027396 $ */ public class MultiFieldQueryParser extends QueryParser { diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/search/FieldComparator.java docvalues/lucene/src/java/org/apache/lucene/search/FieldComparator.java --- trunk_2/lucene/src/java/org/apache/lucene/search/FieldComparator.java 2011-02-28 11:48:56.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/search/FieldComparator.java 2011-06-03 22:42:59.000000000 +0200 @@ -20,8 +20,10 @@ import java.io.IOException; import org.apache.lucene.index.IndexReader.AtomicReaderContext; -import org.apache.lucene.search.FieldCache.DocTermsIndex; +import org.apache.lucene.index.values.IndexDocValues; +import org.apache.lucene.index.values.IndexDocValues.Source; import org.apache.lucene.search.FieldCache.DocTerms; +import org.apache.lucene.search.FieldCache.DocTermsIndex; import org.apache.lucene.search.cache.ByteValuesCreator; import org.apache.lucene.search.cache.CachedArray; import org.apache.lucene.search.cache.CachedArrayCreator; @@ -38,9 +40,9 @@ import org.apache.lucene.search.cache.CachedArray.ShortValues; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.packed.Direct8; import org.apache.lucene.util.packed.Direct16; import org.apache.lucene.util.packed.Direct32; +import org.apache.lucene.util.packed.Direct8; import org.apache.lucene.util.packed.PackedInts; /** @@ -157,7 +159,6 @@ * comparators can just return "this" to reuse the same * comparator across segments * @throws IOException - * @throws IOException */ public abstract FieldComparator setNextReader(AtomicReaderContext context) throws IOException; @@ -328,6 +329,70 @@ } } + /** Uses float index values to sort by ascending value */ + public static final class FloatDocValuesComparator extends FieldComparator { + private final double[] values; + private Source currentReaderValues; + private final String field; + private double bottom; + private final float missingValue; + + FloatDocValuesComparator(int numHits, String field, Float missingValue) { + values = new double[numHits]; + this.field = field; + this.missingValue = missingValue == null ? 0 : missingValue.floatValue(); + } + + @Override + public int compare(int slot1, int slot2) { + final double v1 = values[slot1]; + final double v2 = values[slot2]; + if (v1 > v2) { + return 1; + } else if (v1 < v2) { + return -1; + } else { + return 0; + } + } + + @Override + public int compareBottom(int doc) { + final double v2 = currentReaderValues.getFloat(doc); + if (bottom > v2) { + return 1; + } else if (bottom < v2) { + return -1; + } else { + return 0; + } + } + + @Override + public void copy(int slot, int doc) { + values[slot] = currentReaderValues.getFloat(doc); + } + + @Override + public FieldComparator setNextReader(AtomicReaderContext context) throws IOException { + final IndexDocValues docValues = context.reader.docValues(field); + if (docValues != null) { + currentReaderValues = docValues.getSource(); + } + return this; + } + + @Override + public void setBottom(final int bottom) { + this.bottom = values[bottom]; + } + + @Override + public Comparable value(int slot) { + return Double.valueOf(values[slot]); + } + } + /** Parses field's values as float (using {@link * FieldCache#getFloats} and sorts by ascending value */ public static final class FloatComparator extends NumericComparator { @@ -536,6 +601,74 @@ } } + /** Loads int index values and sorts by ascending value. 
*/ + public static final class IntDocValuesComparator extends FieldComparator { + private final long[] values; + private Source currentReaderValues; + private final String field; + private long bottom; + private int missingValue; + + IntDocValuesComparator(int numHits, String field, Integer missingValue) { + values = new long[numHits]; + this.field = field; + this.missingValue = missingValue == null ? 0 : missingValue.intValue(); + } + + @Override + public int compare(int slot1, int slot2) { + // TODO: there are sneaky non-branch ways to compute + // -1/+1/0 sign + final long v1 = values[slot1]; + final long v2 = values[slot2]; + if (v1 > v2) { + return 1; + } else if (v1 < v2) { + return -1; + } else { + return 0; + } + } + + @Override + public int compareBottom(int doc) { + // TODO: there are sneaky non-branch ways to compute + // -1/+1/0 sign + final long v2 = currentReaderValues.getInt(doc); + if (bottom > v2) { + return 1; + } else if (bottom < v2) { + return -1; + } else { + return 0; + } + } + + @Override + public void copy(int slot, int doc) { + values[slot] = currentReaderValues.getInt(doc); + } + + @Override + public FieldComparator setNextReader(AtomicReaderContext context) throws IOException { + IndexDocValues docValues = context.reader.docValues(field); + if (docValues != null) { + currentReaderValues = docValues.getSource(); + } + return this; + } + + @Override + public void setBottom(final int bottom) { + this.bottom = values[bottom]; + } + + @Override + public Comparable value(int slot) { + return Long.valueOf(values[slot]); + } + } + /** Parses field's values as long (using {@link * FieldCache#getLongs} and sorts by ascending value */ public static final class LongComparator extends NumericComparator { diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/search/ReqExclScorer.java docvalues/lucene/src/java/org/apache/lucene/search/ReqExclScorer.java --- trunk_2/lucene/src/java/org/apache/lucene/search/ReqExclScorer.java 2011-02-01 17:08:42.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/search/ReqExclScorer.java 2011-02-09 11:13:33.000000000 +0100 @@ -23,7 +23,7 @@ /** A Scorer for queries with a required subscorer * and an excluding (prohibited) sub DocIdSetIterator. *
- * This Scorer implements {@link Scorer#skipTo(int)}, + * This Scorer implements {@link Scorer#advance(int)}, * and it uses the skipTo() on the given scorers. */ class ReqExclScorer extends Scorer { diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/search/ReqOptSumScorer.java docvalues/lucene/src/java/org/apache/lucene/search/ReqOptSumScorer.java --- trunk_2/lucene/src/java/org/apache/lucene/search/ReqOptSumScorer.java 2011-02-01 17:08:42.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/search/ReqOptSumScorer.java 2011-02-09 11:13:33.000000000 +0100 @@ -21,7 +21,7 @@ /** A Scorer for queries with a required part and an optional part. * Delays skipTo() on the optional part until a score() is needed. *
- * This Scorer implements {@link Scorer#skipTo(int)}. + * This Scorer implements {@link Scorer#advance(int)}. */ class ReqOptSumScorer extends Scorer { /** The scorers passed from the constructor. diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/search/SortField.java docvalues/lucene/src/java/org/apache/lucene/search/SortField.java --- trunk_2/lucene/src/java/org/apache/lucene/search/SortField.java 2011-03-23 18:54:06.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/search/SortField.java 2011-05-17 16:46:37.000000000 +0200 @@ -18,9 +18,15 @@ */ import java.io.IOException; +import java.util.Comparator; import org.apache.lucene.search.cache.*; import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.BytesRef; + +// TODO(simonw) -- for cleaner transition, maybe we should make +// a new SortField that subclasses this one and always uses +// index values? /** * Stores information about how to sort documents by terms in an individual @@ -81,6 +87,9 @@ * uses ordinals to do the sorting. */ public static final int STRING_VAL = 11; + /** Sort use byte[] index values. */ + public static final int BYTES = 12; + /** Represents sorting by document score (relevance). */ public static final SortField FIELD_SCORE = new SortField (null, SCORE); @@ -390,6 +399,26 @@ return hash; } + private boolean useIndexValues; + + public void setUseIndexValues(boolean b) { + useIndexValues = b; + } + + public boolean getUseIndexValues() { + return useIndexValues; + } + + private Comparator bytesComparator = BytesRef.getUTF8SortedAsUnicodeComparator(); + + public void setBytesComparator(Comparator b) { + bytesComparator = b; + } + + public Comparator getBytesComparator() { + return bytesComparator; + } + /** Returns the {@link FieldComparator} to use for * sorting. * @@ -412,10 +441,18 @@ return new FieldComparator.DocComparator(numHits); case SortField.INT: - return new FieldComparator.IntComparator(numHits, (IntValuesCreator)creator, (Integer)missingValue ); + if (useIndexValues) { + return new FieldComparator.IntDocValuesComparator(numHits, field, (Integer) missingValue); + } else { + return new FieldComparator.IntComparator(numHits, (IntValuesCreator)creator, (Integer) missingValue); + } case SortField.FLOAT: - return new FieldComparator.FloatComparator(numHits, (FloatValuesCreator)creator, (Float)missingValue ); + if (useIndexValues) { + return new FieldComparator.FloatDocValuesComparator(numHits, field, (Float) missingValue); + } else { + return new FieldComparator.FloatComparator(numHits, (FloatValuesCreator) creator, (Float) missingValue); + } case SortField.LONG: return new FieldComparator.LongComparator(numHits, (LongValuesCreator)creator, (Long)missingValue ); diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/search/function/NumericIndexDocValueSource.java docvalues/lucene/src/java/org/apache/lucene/search/function/NumericIndexDocValueSource.java --- trunk_2/lucene/src/java/org/apache/lucene/search/function/NumericIndexDocValueSource.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/search/function/NumericIndexDocValueSource.java 2011-06-03 22:42:59.000000000 +0200 @@ -0,0 +1,114 @@ +package org.apache.lucene.search.function; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
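[Editorial aside, not part of the patch: the SortField change above is what selects the new doc-values based comparators; a typical request might look as follows, with the field name as a placeholder.]

    // Sketch only: sort by an INTS doc values field instead of FieldCache.
    static org.apache.lucene.search.Sort sortByIntDocValues(String field) {
      final org.apache.lucene.search.SortField sortField =
          new org.apache.lucene.search.SortField(field, org.apache.lucene.search.SortField.INT);
      sortField.setUseIndexValues(true); // resolve values via IndexReader#docValues(field)
      return new org.apache.lucene.search.Sort(sortField);
    }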
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; + +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.values.IndexDocValues; +import org.apache.lucene.index.values.ValueType; + +/** + * Expert: obtains numeric field values from a {@link IndexDocValues} field. + * This {@link ValueSource} is compatible with all numerical + * {@link IndexDocValues} + * + * @lucene.experimental + * + */ +public class NumericIndexDocValueSource extends ValueSource { + + private final String field; + + public NumericIndexDocValueSource(String field) { + this.field = field; + } + + @Override + public DocValues getValues(AtomicReaderContext context) throws IOException { + final IndexDocValues.Source source = context.reader.docValues(field) + .getSource(); + ValueType type = source.type(); + switch (type) { + case FLOAT_32: + case FLOAT_64: + return new DocValues() { + + @Override + public String toString(int doc) { + return "float: [" + floatVal(doc) + "]"; + } + + @Override + public float floatVal(int doc) { + return (float) source.getFloat(doc); + } + }; + + case INTS: + return new DocValues() { + @Override + public String toString(int doc) { + return "float: [" + floatVal(doc) + "]"; + } + + @Override + public float floatVal(int doc) { + return (float) source.getInt(doc); + } + }; + default: + throw new IOException("Type: " + type + "is not numeric"); + } + + } + + @Override + public String description() { + return toString(); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((field == null) ? 
0 : field.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + NumericIndexDocValueSource other = (NumericIndexDocValueSource) obj; + if (field == null) { + if (other.field != null) + return false; + } else if (!field.equals(other.field)) + return false; + return true; + } + + @Override + public String toString() { + return "DocValues float(" + field + ')'; + } +} diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/util/ArrayUtil.java docvalues/lucene/src/java/org/apache/lucene/util/ArrayUtil.java --- trunk_2/lucene/src/java/org/apache/lucene/util/ArrayUtil.java 2011-03-27 09:34:45.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/util/ArrayUtil.java 2011-05-17 16:46:45.000000000 +0200 @@ -255,6 +255,19 @@ return grow(array, 1 + array.length); } + public static double[] grow(double[] array, int minSize) { + if (array.length < minSize) { + double[] newArray = new double[oversize(minSize, RamUsageEstimator.NUM_BYTES_DOUBLE)]; + System.arraycopy(array, 0, newArray, 0, array.length); + return newArray; + } else + return array; + } + + public static double[] grow(double[] array) { + return grow(array, 1 + array.length); + } + public static short[] shrink(short[] array, int targetSize) { final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_SHORT); if (newSize != array.length) { diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/util/ByteBlockPool.java docvalues/lucene/src/java/org/apache/lucene/util/ByteBlockPool.java --- trunk_2/lucene/src/java/org/apache/lucene/util/ByteBlockPool.java 2010-12-13 18:14:47.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/util/ByteBlockPool.java 2011-05-19 22:32:18.000000000 +0200 @@ -18,6 +18,8 @@ */ import java.util.Arrays; import java.util.List; +import java.util.concurrent.atomic.AtomicLong; + import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF; /** @@ -78,6 +80,33 @@ } } + + public static class DirectTrackingAllocator extends Allocator { + private final AtomicLong bytesUsed; + + public DirectTrackingAllocator(AtomicLong bytesUsed) { + this(BYTE_BLOCK_SIZE, bytesUsed); + } + + public DirectTrackingAllocator(int blockSize, AtomicLong bytesUsed) { + super(blockSize); + this.bytesUsed = bytesUsed; + } + + public byte[] getByteBlock() { + bytesUsed.addAndGet(blockSize); + return new byte[blockSize]; + } + @Override + public void recycleByteBlocks(byte[][] blocks, int start, int end) { + bytesUsed.addAndGet(-((end-start)* blockSize)); + for (int i = start; i < end; i++) { + blocks[i] = null; + } + } + + }; + public byte[][] buffers = new byte[10][]; @@ -92,6 +121,20 @@ public ByteBlockPool(Allocator allocator) { this.allocator = allocator; } + + public void dropBuffersAndReset() { + if (bufferUpto != -1) { + // Recycle all but the first buffer + allocator.recycleByteBlocks(buffers, 0, 1+bufferUpto); + + // Re-use the first buffer + bufferUpto = -1; + byteUpto = BYTE_BLOCK_SIZE; + byteOffset = -BYTE_BLOCK_SIZE; + buffers = new byte[10][]; + buffer = null; + } + } public void reset() { if (bufferUpto != -1) { @@ -115,7 +158,7 @@ buffer = buffers[0]; } } - + public void nextBuffer() { if (1+bufferUpto == buffers.length) { byte[][] newBuffers = new byte[ArrayUtil.oversize(buffers.length+1, diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/util/BytesRefHash.java 
docvalues/lucene/src/java/org/apache/lucene/util/BytesRefHash.java --- trunk_2/lucene/src/java/org/apache/lucene/util/BytesRefHash.java 2011-05-30 04:10:50.000000000 +0200 +++ docvalues/lucene/src/java/org/apache/lucene/util/BytesRefHash.java 2011-06-04 00:22:11.000000000 +0200 @@ -227,8 +227,9 @@ public void clear(boolean resetPool) { lastCount = count; count = 0; - if (resetPool) - pool.reset(); + if (resetPool) { + pool.dropBuffersAndReset(); + } bytesStart = bytesStartArray.clear(); if (lastCount != -1 && shrink(lastCount)) { // shrink clears the hash entries @@ -240,6 +241,16 @@ public void clear() { clear(true); } + + /** + * Closes the BytesRefHash and releases all internally used memory + */ + public void close() { + clear(true); + ords = null; + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT + * -hashSize); + } /** * Adds a new {@link BytesRef} @@ -332,6 +343,7 @@ // 1 byte to store length buffer[bufferUpto] = (byte) length; pool.byteUpto += length + 1; + assert length >= 0: "Length must be positive: " + length; System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 1, length); } else { @@ -452,8 +464,14 @@ * effect. */ public void reinit() { - if (bytesStart == null) + if (bytesStart == null) { bytesStart = bytesStartArray.init(); + } + + if (ords == null) { + ords = new int[hashSize]; + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * hashSize); + } } /** @@ -514,17 +532,62 @@ */ public abstract AtomicLong bytesUsed(); } + + /** + * A direct {@link BytesStartArray} that tracks all memory allocation using an {@link AtomicLong} instance. + */ + public static class TrackingDirectBytesStartArray extends BytesStartArray { + protected final int initSize; + private int[] bytesStart; + protected final AtomicLong bytesUsed; + + public TrackingDirectBytesStartArray(int initSize, AtomicLong bytesUsed) { + this.initSize = initSize; + this.bytesUsed = bytesUsed; + } - public static class DirectBytesStartArray extends BytesStartArray { + @Override + public int[] clear() { + if (bytesStart != null) { + bytesUsed.addAndGet(-bytesStart.length * RamUsageEstimator.NUM_BYTES_INT); + } + return bytesStart = null; + } + @Override + public int[] grow() { + assert bytesStart != null; + final int oldSize = bytesStart.length; + bytesStart = ArrayUtil.grow(bytesStart, bytesStart.length + 1); + bytesUsed.addAndGet((bytesStart.length - oldSize) * RamUsageEstimator.NUM_BYTES_INT); + return bytesStart; + } + + @Override + public int[] init() { + bytesStart = new int[ArrayUtil.oversize(initSize, + RamUsageEstimator.NUM_BYTES_INT)]; + bytesUsed.addAndGet((bytesStart.length) * RamUsageEstimator.NUM_BYTES_INT); + return bytesStart; + } + + @Override + public AtomicLong bytesUsed() { + return bytesUsed; + } + } + + public static class DirectBytesStartArray extends BytesStartArray { protected final int initSize; private int[] bytesStart; - private final AtomicLong bytesUsed = new AtomicLong(0); - + private final AtomicLong bytesUsed; + public DirectBytesStartArray(int initSize) { + this.bytesUsed = new AtomicLong(0); this.initSize = initSize; } + @Override public int[] clear() { return bytesStart = null; @@ -546,6 +609,5 @@ public AtomicLong bytesUsed() { return bytesUsed; } - } } diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/util/FloatsRef.java docvalues/lucene/src/java/org/apache/lucene/util/FloatsRef.java --- trunk_2/lucene/src/java/org/apache/lucene/util/FloatsRef.java 1970-01-01 01:00:00.000000000 +0100 +++ 
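Usage sketch (not part of the patch): the DirectTrackingAllocator added to ByteBlockPool and the TrackingDirectBytesStartArray and close() added to BytesRefHash above can share one AtomicLong, so a caller sees the memory of both the byte blocks and the per-term start offsets on a single counter. The three-argument BytesRefHash constructor and its add() method are assumed to match trunk at the time; the capacity of 16 and the sample terms are arbitrary.

import java.util.concurrent.atomic.AtomicLong;

import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;

class TrackedBytesRefHashSketch {
  public static void main(String[] args) {
    // one shared counter is updated by both the block allocator and the start array
    AtomicLong bytesUsed = new AtomicLong(0);
    ByteBlockPool pool = new ByteBlockPool(
        new ByteBlockPool.DirectTrackingAllocator(bytesUsed));
    BytesRefHash hash = new BytesRefHash(pool, 16,
        new BytesRefHash.TrackingDirectBytesStartArray(16, bytesUsed));

    hash.add(new BytesRef("lucene"));
    hash.add(new BytesRef("docvalues"));
    System.out.println("bytes used while populated: " + bytesUsed.get());

    // close() clears the hash, recycles the pooled blocks and credits the counter back
    hash.close();
    System.out.println("bytes used after close: " + bytesUsed.get());
  }
}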
docvalues/lucene/src/java/org/apache/lucene/util/FloatsRef.java 2011-05-19 22:32:18.000000000 +0200 @@ -0,0 +1,109 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Represents double[], as a slice (offset + length) into an existing float[]. + * + * @lucene.internal + */ +public final class FloatsRef implements Cloneable{ + public double[] floats; + public int offset; + public int length; + + public FloatsRef() { + } + + public FloatsRef(int capacity) { + floats = new double[capacity]; + } + + public void set(double value) { + floats[offset] = value; + } + + public double get() { + return floats[offset]; + } + + public FloatsRef(double[] floats, int offset, int length) { + this.floats = floats; + this.offset = offset; + this.length = length; + } + + public FloatsRef(FloatsRef other) { + copy(other); + } + + @Override + public Object clone() { + return new FloatsRef(this); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 0; + final int end = offset + length; + for(int i = offset; i < end; i++) { + long value = Double.doubleToLongBits(floats[i]); + result = prime * result + (int) (value ^ (value >>> 32)); + } + return result; + } + + @Override + public boolean equals(Object other) { + return other instanceof FloatsRef && this.floatsEquals((FloatsRef) other); + } + + public boolean floatsEquals(FloatsRef other) { + if (length == other.length) { + int otherUpto = other.offset; + final double[] otherFloats = other.floats; + final int end = offset + length; + for(int upto=offset;upto>> 32)); + } + return result; + } + + @Override + public boolean equals(Object other) { + return this.intsEquals((LongsRef) other); + } + + public boolean intsEquals(LongsRef other) { + if (length == other.length) { + int otherUpto = other.offset; + final long[] otherInts = other.ints; + final int end = offset + length; + for (int upto = offset; upto < end; upto++, otherUpto++) { + if (ints[upto] != otherInts[otherUpto]) { + return false; + } + } + return true; + } else { + return false; + } + } + + public void copy(LongsRef other) { + if (ints == null) { + ints = new long[other.length]; + } else { + ints = ArrayUtil.grow(ints, other.length); + } + System.arraycopy(other.ints, other.offset, ints, 0, other.length); + length = other.length; + offset = 0; + } + + public void grow(int newLength) { + if (ints.length < newLength) { + ints = ArrayUtil.grow(ints, newLength); + } + } +} \ No newline at end of file diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/util/PagedBytes.java docvalues/lucene/src/java/org/apache/lucene/util/PagedBytes.java --- trunk_2/lucene/src/java/org/apache/lucene/util/PagedBytes.java 2010-11-30 12:59:55.000000000 +0100 +++ 
docvalues/lucene/src/java/org/apache/lucene/util/PagedBytes.java 2010-12-03 12:27:50.000000000 +0100 @@ -99,7 +99,7 @@ } return b; } - + /** * Reads length as 1 or 2 byte vInt prefix, starting at start. *
@@ -184,6 +184,55 @@ } return start; } + + + /** + * Gets a slice out of {@link PagedBytes} starting at start, the + * length is read as 1 or 2 byte vInt prefix. Iff the slice spans across a + * block border this method will allocate sufficient resources and copy the + * paged data. + *
+ * Slices spanning more than one block are not supported. + *
+ * + * @lucene.internal + **/ + public BytesRef fillSliceWithPrefix(BytesRef b, long start) { + final int index = (int) (start >> blockBits); + int offset = (int) (start & blockMask); + final byte[] block = blocks[index]; + final int length; + if ((block[offset] & 128) == 0) { + length = block[offset]; + offset = offset+1; + } else { + length = ((block[offset] & 0x7f) << 8) | (block[1+offset] & 0xff); + offset = offset+2; + assert length > 0; + } + assert length >= 0: "length=" + length; + b.length = length; + if (blockSize - offset >= length) { + // Within block + b.offset = offset; + b.bytes = blocks[index]; + } else { + // Split + byte[] buffer = threadBuffers.get(); + if (buffer == null) { + buffer = new byte[length]; + threadBuffers.set(buffer); + } else if (buffer.length < length) { + buffer = ArrayUtil.grow(buffer, length); + threadBuffers.set(buffer); + } + b.bytes = buffer; + b.offset = 0; + System.arraycopy(blocks[index], offset, buffer, 0, blockSize-offset); + System.arraycopy(blocks[1+index], 0, buffer, blockSize-offset, length-(blockSize-offset)); + } + return b; + } /** @lucene.internal */ public byte[][] getBlocks() { diff -ruN -x .svn -x build trunk_2/lucene/src/java/org/apache/lucene/util/packed/Packed64.java docvalues/lucene/src/java/org/apache/lucene/util/packed/Packed64.java --- trunk_2/lucene/src/java/org/apache/lucene/util/packed/Packed64.java 2011-02-01 17:08:46.000000000 +0100 +++ docvalues/lucene/src/java/org/apache/lucene/util/packed/Packed64.java 2011-02-09 11:13:36.000000000 +0100 @@ -182,7 +182,7 @@ final int bitPos = (int)(majorBitPos & MOD_MASK); // % BLOCK_SIZE); final int base = bitPos * FAC_BITPOS; - + assert elementPos < blocks.length : "elementPos: " + elementPos + "; blocks.len: " + blocks.length; return ((blocks[elementPos] << shifts[base]) >>> shifts[base+1]) | ((blocks[elementPos+1] >>> shifts[base+2]) & readMasks[bitPos]); } diff -ruN -x .svn -x build trunk_2/lucene/src/test/org/apache/lucene/TestExternalCodecs.java docvalues/lucene/src/test/org/apache/lucene/TestExternalCodecs.java --- trunk_2/lucene/src/test/org/apache/lucene/TestExternalCodecs.java 2011-05-28 09:04:49.000000000 +0200 +++ docvalues/lucene/src/test/org/apache/lucene/TestExternalCodecs.java 2011-05-17 16:46:32.000000000 +0200 @@ -490,11 +490,21 @@ } @Override + public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { + return null; + } + + @Override + public PerDocValues docsProducer(SegmentReadState state) throws IOException { + return null; + } + + @Override public void getExtensions(Set extensions) { } @Override - public void files(Directory dir, SegmentInfo segmentInfo, String codecId, Set files) { + public void files(Directory dir, SegmentInfo segmentInfo, int codecId, Set files) { } } diff -ruN -x .svn -x build trunk_2/lucene/src/test/org/apache/lucene/index/TestCodecs.java docvalues/lucene/src/test/org/apache/lucene/index/TestCodecs.java --- trunk_2/lucene/src/test/org/apache/lucene/index/TestCodecs.java 2011-05-28 09:04:49.000000000 +0200 +++ docvalues/lucene/src/test/org/apache/lucene/index/TestCodecs.java 2011-05-17 16:46:29.000000000 +0200 @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.Arrays; import java.util.HashSet; +import java.util.concurrent.atomic.AtomicLong; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; diff -ruN -x .svn -x build trunk_2/lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java docvalues/lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java --- 
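For orientation (not code from the patch): fillSliceWithPrefix() above reads a length prefix that is one byte for lengths below 128 and two bytes otherwise, with the high bit of the first byte marking the two-byte form, so a prefixed slice is limited to 32767 bytes. The encoder below is only a hypothetical mirror of that decode logic, written to make the convention concrete; PagedBytes itself does not expose helpers under these names.

class VIntPrefixSketch {
  // Writes a 1- or 2-byte length prefix and returns the offset where the payload starts.
  static int writePrefix(byte[] block, int offset, int length) {
    if (length < 128) {
      block[offset] = (byte) length;                  // high bit clear: single-byte prefix
      return offset + 1;
    } else {
      block[offset] = (byte) (0x80 | (length >> 8));  // high bit set: two-byte prefix
      block[offset + 1] = (byte) (length & 0xff);
      return offset + 2;
    }
  }

  // Mirrors the decode in fillSliceWithPrefix().
  static int readPrefix(byte[] block, int offset) {
    if ((block[offset] & 128) == 0) {
      return block[offset];
    }
    return ((block[offset] & 0x7f) << 8) | (block[offset + 1] & 0xff);
  }

  public static void main(String[] args) {
    byte[] block = new byte[4];
    writePrefix(block, 0, 300);
    System.out.println(readPrefix(block, 0)); // prints 300
  }
}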
trunk_2/lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java 2011-05-28 09:04:49.000000000 +0200 +++ docvalues/lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java 2011-05-17 16:46:30.000000000 +0200 @@ -33,10 +33,15 @@ import org.apache.lucene.index.codecs.BlockTermsWriter; import org.apache.lucene.index.codecs.Codec; import org.apache.lucene.index.codecs.CoreCodecProvider; +import org.apache.lucene.index.codecs.DocValuesConsumer; +import org.apache.lucene.index.codecs.DefaultDocValuesProducer; import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.FieldsProducer; import org.apache.lucene.index.codecs.FixedGapTermsIndexReader; import org.apache.lucene.index.codecs.FixedGapTermsIndexWriter; +import org.apache.lucene.index.codecs.PerDocConsumer; +import org.apache.lucene.index.codecs.DefaultDocValuesConsumer; +import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.index.codecs.PostingsReaderBase; import org.apache.lucene.index.codecs.PostingsWriterBase; import org.apache.lucene.index.codecs.TermsIndexReaderBase; @@ -192,15 +197,17 @@ static final String PROX_EXTENSION = "prx"; @Override - public void files(Directory dir, SegmentInfo segmentInfo, String id, Set files) throws IOException { - StandardPostingsReader.files(dir, segmentInfo, id, files); - BlockTermsReader.files(dir, segmentInfo, id, files); - FixedGapTermsIndexReader.files(dir, segmentInfo, id, files); + public void files(Directory dir, SegmentInfo segmentInfo, int id, Set files) throws IOException { + StandardPostingsReader.files(dir, segmentInfo, ""+id, files); + BlockTermsReader.files(dir, segmentInfo, ""+id, files); + FixedGapTermsIndexReader.files(dir, segmentInfo, ""+id, files); + DefaultDocValuesConsumer.files(dir, segmentInfo, id, files); } @Override public void getExtensions(Set extensions) { getStandardExtensions(extensions); + DefaultDocValuesConsumer.getDocValuesExtensions(extensions); } public static void getStandardExtensions(Set extensions) { @@ -209,6 +216,16 @@ BlockTermsReader.getExtensions(extensions); FixedGapTermsIndexReader.getIndexExtensions(extensions); } + + @Override + public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { + return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); + } + + @Override + public PerDocValues docsProducer(SegmentReadState state) throws IOException { + return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId); + } } public void testRandom() throws Exception { diff -ruN -x .svn -x build trunk_2/lucene/src/test/org/apache/lucene/index/TestFieldInfos.java docvalues/lucene/src/test/org/apache/lucene/index/TestFieldInfos.java --- trunk_2/lucene/src/test/org/apache/lucene/index/TestFieldInfos.java 2011-03-27 09:34:44.000000000 +0200 +++ docvalues/lucene/src/test/org/apache/lucene/index/TestFieldInfos.java 2011-05-17 16:46:30.000000000 +0200 @@ -137,7 +137,7 @@ try { readOnly.addOrUpdate("bogus", random.nextBoolean(), random.nextBoolean(), random.nextBoolean(), random.nextBoolean(), random.nextBoolean(), - random.nextBoolean(), random.nextBoolean()); + random.nextBoolean(), random.nextBoolean(), null); fail("instance should be read only"); } catch (IllegalStateException e) { // expected diff -ruN -x .svn -x build trunk_2/lucene/src/test/org/apache/lucene/index/values/TestDocValues.java docvalues/lucene/src/test/org/apache/lucene/index/values/TestDocValues.java --- 
trunk_2/lucene/src/test/org/apache/lucene/index/values/TestDocValues.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/test/org/apache/lucene/index/values/TestDocValues.java 2011-06-03 22:43:05.000000000 +0200 @@ -0,0 +1,326 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Comparator; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.lucene.index.values.IndexDocValues.SortedSource; +import org.apache.lucene.index.values.IndexDocValues.Source; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FloatsRef; +import org.apache.lucene.util.LongsRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util._TestUtil; + +public class TestDocValues extends LuceneTestCase { + + // TODO -- for sorted test, do our own Sort of the + // values and verify it's identical + + public void testBytesStraight() throws IOException { + runTestBytes(Bytes.Mode.STRAIGHT, true); + runTestBytes(Bytes.Mode.STRAIGHT, false); + } + + public void testBytesDeref() throws IOException { + runTestBytes(Bytes.Mode.DEREF, true); + runTestBytes(Bytes.Mode.DEREF, false); + } + + public void testBytesSorted() throws IOException { + runTestBytes(Bytes.Mode.SORTED, true); + runTestBytes(Bytes.Mode.SORTED, false); + } + + public void runTestBytes(final Bytes.Mode mode, final boolean fixedSize) + throws IOException { + + final BytesRef bytesRef = new BytesRef(); + + final Comparator comp = mode == Bytes.Mode.SORTED ? BytesRef + .getUTF8SortedAsUnicodeComparator() : null; + + Directory dir = newDirectory(); + final AtomicLong trackBytes = new AtomicLong(0); + Writer w = Bytes.getWriter(dir, "test", mode, comp, fixedSize, trackBytes); + int maxDoc = 220; + final String[] values = new String[maxDoc]; + final int fixedLength = 3 + random.nextInt(7); + for (int i = 0; i < 100; i++) { + final String s; + if (i > 0 && random.nextInt(5) <= 2) { + // use prior value + s = values[2 * random.nextInt(i)]; + } else { + s = _TestUtil.randomFixedByteLengthUnicodeString(random, fixedSize? 
fixedLength : 1 + random.nextInt(39)); + } + values[2 * i] = s; + + UnicodeUtil.UTF16toUTF8(s, 0, s.length(), bytesRef); + w.add(2 * i, bytesRef); + } + w.finish(maxDoc); + assertEquals(0, trackBytes.get()); + + IndexDocValues r = Bytes.getValues(dir, "test", mode, fixedSize, maxDoc); + for (int iter = 0; iter < 2; iter++) { + ValuesEnum bytesEnum = getEnum(r); + assertNotNull("enum is null", bytesEnum); + BytesRef ref = bytesEnum.bytes(); + + for (int i = 0; i < 2; i++) { + final int idx = 2 * i; + assertEquals("doc: " + idx, idx, bytesEnum.advance(idx)); + String utf8String = ref.utf8ToString(); + assertEquals("doc: " + idx + " lenLeft: " + values[idx].length() + + " lenRight: " + utf8String.length(), values[idx], utf8String); + } + assertEquals(ValuesEnum.NO_MORE_DOCS, bytesEnum.advance(maxDoc)); + assertEquals(ValuesEnum.NO_MORE_DOCS, bytesEnum.advance(maxDoc + 1)); + + bytesEnum.close(); + } + + // Verify we can load source twice: + for (int iter = 0; iter < 2; iter++) { + Source s; + IndexDocValues.SortedSource ss; + if (mode == Bytes.Mode.SORTED) { + s = ss = getSortedSource(r, comp); + } else { + s = getSource(r); + ss = null; + } + for (int i = 0; i < 100; i++) { + final int idx = 2 * i; + assertNotNull("doc " + idx + "; value=" + values[idx], s.getBytes(idx, + bytesRef)); + assertEquals("doc " + idx, values[idx], s.getBytes(idx, bytesRef) + .utf8ToString()); + if (ss != null) { + assertEquals("doc " + idx, values[idx], ss.getByOrd(ss.ord(idx), + bytesRef).utf8ToString()); + int ord = ss + .getByValue(new BytesRef(values[idx])); + assertTrue(ord >= 0); + assertEquals(ss.ord(idx), ord); + } + } + + // Lookup random strings: + if (mode == Bytes.Mode.SORTED) { + final int numValues = ss.getValueCount(); + for (int i = 0; i < 1000; i++) { + BytesRef bytesValue = new BytesRef(_TestUtil.randomFixedByteLengthUnicodeString(random, fixedSize? 
fixedLength : 1 + random.nextInt(39))); + int ord = ss.getByValue(bytesValue); + if (ord >= 0) { + assertTrue(bytesValue + .bytesEquals(ss.getByOrd(ord, bytesRef))); + int count = 0; + for (int k = 0; k < 100; k++) { + if (bytesValue.utf8ToString().equals(values[2 * k])) { + assertEquals(ss.ord(2 * k), ord); + count++; + } + } + assertTrue(count > 0); + } else { + assert ord < 0; + int insertIndex = (-ord)-1; + if (insertIndex == 0) { + final BytesRef firstRef = ss.getByOrd(1, bytesRef); + // random string was before our first + assertTrue(firstRef.compareTo(bytesValue) > 0); + } else if (insertIndex == numValues) { + final BytesRef lastRef = ss.getByOrd(numValues-1, bytesRef); + // random string was after our last + assertTrue(lastRef.compareTo(bytesValue) < 0); + } else { + final BytesRef before = (BytesRef) ss.getByOrd(insertIndex-1, bytesRef) + .clone(); + BytesRef after = ss.getByOrd(insertIndex, bytesRef); + assertTrue(comp.compare(before, bytesValue) < 0); + assertTrue(comp.compare(bytesValue, after) < 0); + } + } + } + } + } + + r.close(); + dir.close(); + } + + public void testInts() throws IOException { + long[] maxMin = new long[] { + Long.MIN_VALUE, Long.MAX_VALUE, + 1, Long.MAX_VALUE, + 0, Long.MAX_VALUE, + -1, Long.MAX_VALUE, + Long.MIN_VALUE, -1, + random.nextInt(), random.nextInt() }; + for (int j = 0; j < maxMin.length; j+=2) { + long maxV = 1; + final int NUM_VALUES = 777 + random.nextInt(777); + final long[] values = new long[NUM_VALUES]; + for (int rx = 1; rx < 63; rx++, maxV *= 2) { + Directory dir = newDirectory(); + final AtomicLong trackBytes = new AtomicLong(0); + Writer w = Ints.getWriter(dir, "test", false, trackBytes); + values[0] = maxMin[j]; + w.add(0, values[0]); + values[1] = maxMin[j+1]; + w.add(1, values[1]); + for (int i = 2; i < NUM_VALUES; i++) { + final long v = random.nextLong() % (1 + maxV); + values[i] = v; + w.add(i, v); + } + final int additionalDocs = 1 + random.nextInt(9); + w.finish(NUM_VALUES + additionalDocs); + assertEquals(0, trackBytes.get()); + + IndexDocValues r = Ints.getValues(dir, "test", false); + for (int iter = 0; iter < 2; iter++) { + Source s = getSource(r); + for (int i = 0; i < NUM_VALUES; i++) { + final long v = s.getInt(i); + assertEquals("index " + i, values[i], v); + } + } + + for (int iter = 0; iter < 2; iter++) { + ValuesEnum iEnum = getEnum(r); + LongsRef ints = iEnum.getInt(); + for (int i = 0; i < NUM_VALUES + additionalDocs; i++) { + assertEquals(i, iEnum.nextDoc()); + if (i < NUM_VALUES) { + assertEquals(values[i], ints.get()); + } else { + assertEquals(0, ints.get()); + } + } + assertEquals(ValuesEnum.NO_MORE_DOCS, iEnum.nextDoc()); + iEnum.close(); + } + + for (int iter = 0; iter < 2; iter++) { + ValuesEnum iEnum = getEnum(r); + LongsRef ints = iEnum.getInt(); + for (int i = 0; i < NUM_VALUES + additionalDocs; i += 1 + random.nextInt(25)) { + assertEquals(i, iEnum.advance(i)); + if (i < NUM_VALUES) { + assertEquals(values[i], ints.get()); + } else { + assertEquals(0, ints.get()); + } + } + assertEquals(ValuesEnum.NO_MORE_DOCS, iEnum.advance(NUM_VALUES + additionalDocs)); + iEnum.close(); + } + r.close(); + dir.close(); + } + } + } + + public void testFloats4() throws IOException { + runTestFloats(4, 0.00001); + } + + private void runTestFloats(int precision, double delta) throws IOException { + Directory dir = newDirectory(); + final AtomicLong trackBytes = new AtomicLong(0); + Writer w = Floats.getWriter(dir, "test", precision, trackBytes); + final int NUM_VALUES = 777 + random.nextInt(777);; + final double[] 
values = new double[NUM_VALUES]; + for (int i = 0; i < NUM_VALUES; i++) { + final double v = precision == 4 ? random.nextFloat() : random + .nextDouble(); + values[i] = v; + w.add(i, v); + } + final int additionalValues = 1 + random.nextInt(10); + w.finish(NUM_VALUES + additionalValues); + assertEquals(0, trackBytes.get()); + + IndexDocValues r = Floats.getValues(dir, "test", NUM_VALUES + additionalValues); + for (int iter = 0; iter < 2; iter++) { + Source s = getSource(r); + for (int i = 0; i < NUM_VALUES; i++) { + assertEquals(values[i], s.getFloat(i), 0.0f); + } + } + + for (int iter = 0; iter < 2; iter++) { + ValuesEnum fEnum = getEnum(r); + FloatsRef floats = fEnum.getFloat(); + for (int i = 0; i < NUM_VALUES + additionalValues; i++) { + assertEquals(i, fEnum.nextDoc()); + if (i < NUM_VALUES) { + assertEquals(values[i], floats.get(), delta); + } else { + assertEquals(0.0d, floats.get(), delta); + } + } + assertEquals(ValuesEnum.NO_MORE_DOCS, fEnum.nextDoc()); + fEnum.close(); + } + for (int iter = 0; iter < 2; iter++) { + ValuesEnum fEnum = getEnum(r); + FloatsRef floats = fEnum.getFloat(); + for (int i = 0; i < NUM_VALUES + additionalValues; i += 1 + random.nextInt(25)) { + assertEquals(i, fEnum.advance(i)); + if (i < NUM_VALUES) { + assertEquals(values[i], floats.get(), delta); + } else { + assertEquals(0.0d, floats.get(), delta); + } + } + assertEquals(ValuesEnum.NO_MORE_DOCS, fEnum.advance(NUM_VALUES + additionalValues)); + fEnum.close(); + } + + r.close(); + dir.close(); + } + + public void testFloats8() throws IOException { + runTestFloats(8, 0.0); + } + + private ValuesEnum getEnum(IndexDocValues values) throws IOException { + return random.nextBoolean() ? values.getEnum() : getSource(values).getEnum(); + } + + private Source getSource(IndexDocValues values) throws IOException { + // getSource uses cache internally + return random.nextBoolean() ? values.load() : values.getSource(); + } + + private SortedSource getSortedSource(IndexDocValues values, + Comparator comparator) throws IOException { + // getSortedSource uses cache internally + return random.nextBoolean() ? values.loadSorted(comparator) : values + .getSortedSorted(comparator); + } +} diff -ruN -x .svn -x build trunk_2/lucene/src/test/org/apache/lucene/index/values/TestDocValuesIndexing.java docvalues/lucene/src/test/org/apache/lucene/index/values/TestDocValuesIndexing.java --- trunk_2/lucene/src/test/org/apache/lucene/index/values/TestDocValuesIndexing.java 1970-01-01 01:00:00.000000000 +0100 +++ docvalues/lucene/src/test/org/apache/lucene/index/values/TestDocValuesIndexing.java 2011-06-04 00:22:11.000000000 +0200 @@ -0,0 +1,577 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.EnumSet; +import java.util.List; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.AbstractField; +import org.apache.lucene.document.IndexDocValuesField; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.LogDocMergePolicy; +import org.apache.lucene.index.LogMergePolicy; +import org.apache.lucene.index.MultiPerDocValues; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.index.codecs.PerDocValues; +import org.apache.lucene.index.values.IndexDocValues.Source; +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.LockObtainFailedException; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FloatsRef; +import org.apache.lucene.util.LongsRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util._TestUtil; +import org.junit.Before; + +/** + * + * Tests DocValues integration into IndexWriter & Codecs + * + */ +public class TestDocValuesIndexing extends LuceneTestCase { + /* + * - add test for unoptimized case with deletes + * - add multithreaded tests / integrate into stress indexing? 
+ */ + + @Before + public void setUp() throws Exception { + super.setUp(); + assumeFalse("cannot work with preflex codec", CodecProvider.getDefault().getDefaultFieldCodec().equals("PreFlex")); + } + + /* + * Simple test case to show how to use the API + */ + public void testDocValuesSimple() throws CorruptIndexException, IOException, + ParseException { + Directory dir = newDirectory(); + IndexWriter writer = new IndexWriter(dir, writerConfig(false)); + for (int i = 0; i < 5; i++) { + Document doc = new Document(); + IndexDocValuesField valuesField = new IndexDocValuesField("docId"); + valuesField.setInt(i); + doc.add(valuesField); + doc.add(new Field("docId", "" + i, Store.NO, Index.ANALYZED)); + writer.addDocument(doc); + } + writer.commit(); + writer.optimize(true); + + writer.close(true); + + IndexReader reader = IndexReader.open(dir, null, true, 1); + assertTrue(reader.isOptimized()); + + IndexSearcher searcher = new IndexSearcher(reader); + QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, "docId", + new MockAnalyzer(random)); + TopDocs search = searcher.search(parser.parse("0 OR 1 OR 2 OR 3 OR 4"), 10); + assertEquals(5, search.totalHits); + ScoreDoc[] scoreDocs = search.scoreDocs; + IndexDocValues docValues = MultiPerDocValues.getPerDocs(reader).docValues("docId"); + Source source = docValues.getSource(); + for (int i = 0; i < scoreDocs.length; i++) { + assertEquals(i, scoreDocs[i].doc); + assertEquals(i, source.getInt(scoreDocs[i].doc)); + } + reader.close(); + dir.close(); + } + + /** + * Tests complete indexing of {@link ValueType} including deletions, merging and + * sparse value fields on Compound-File + */ + public void testIndexBytesNoDeletesCFS() throws IOException { + runTestIndexBytes(writerConfig(true), false); + } + + public void testIndexBytesDeletesCFS() throws IOException { + runTestIndexBytes(writerConfig(true), true); + } + + public void testIndexNumericsNoDeletesCFS() throws IOException { + runTestNumerics(writerConfig(true), false); + } + + public void testIndexNumericsDeletesCFS() throws IOException { + runTestNumerics(writerConfig(true), true); + } + + /** + * Tests complete indexing of {@link ValueType} including deletions, merging and + * sparse value fields on None-Compound-File + */ + public void testIndexBytesNoDeletes() throws IOException { + runTestIndexBytes(writerConfig(false), false); + } + + public void testIndexBytesDeletes() throws IOException { + runTestIndexBytes(writerConfig(false), true); + } + + public void testIndexNumericsNoDeletes() throws IOException { + runTestNumerics(writerConfig(false), false); + } + + public void testIndexNumericsDeletes() throws IOException { + runTestNumerics(writerConfig(false), true); + } + + public void testAddIndexes() throws IOException { + int valuesPerIndex = 10; + List values = Arrays.asList(ValueType.values()); + Collections.shuffle(values, random); + ValueType first = values.get(0); + ValueType second = values.get(1); + String msg = "[first=" + first.name() + ", second=" + second.name() + "]"; + // index first index + Directory d_1 = newDirectory(); + IndexWriter w_1 = new IndexWriter(d_1, writerConfig(random.nextBoolean())); + indexValues(w_1, valuesPerIndex, first, values, false, 7); + w_1.commit(); + assertEquals(valuesPerIndex, w_1.maxDoc()); + _TestUtil.checkIndex(d_1, w_1.getConfig().getCodecProvider()); + + // index second index + Directory d_2 = newDirectory(); + IndexWriter w_2 = new IndexWriter(d_2, writerConfig(random.nextBoolean())); + indexValues(w_2, valuesPerIndex, second, values, 
false, 7); + w_2.commit(); + assertEquals(valuesPerIndex, w_2.maxDoc()); + _TestUtil.checkIndex(d_2, w_2.getConfig().getCodecProvider()); + + Directory target = newDirectory(); + IndexWriter w = new IndexWriter(target, writerConfig(random.nextBoolean())); + IndexReader r_1 = IndexReader.open(w_1, true); + IndexReader r_2 = IndexReader.open(w_2, true); + if (random.nextBoolean()) { + w.addIndexes(d_1, d_2); + } else { + w.addIndexes(r_1, r_2); + } + w.optimize(true); + w.commit(); + + _TestUtil.checkIndex(target, w.getConfig().getCodecProvider()); + assertEquals(valuesPerIndex * 2, w.maxDoc()); + + // check values + + IndexReader merged = IndexReader.open(w, true); + ValuesEnum vE_1 = getValuesEnum(getDocValues(r_1, first.name())); + ValuesEnum vE_2 = getValuesEnum(getDocValues(r_2, second.name())); + ValuesEnum vE_1_merged = getValuesEnum(getDocValues(merged, first.name())); + ValuesEnum vE_2_merged = getValuesEnum(getDocValues(merged, second + .name())); + switch (second) { // these variants don't advance over missing values + case BYTES_FIXED_STRAIGHT: + case BYTES_VAR_STRAIGHT: + case FLOAT_32: + case FLOAT_64: + case INTS: + assertEquals(msg, valuesPerIndex-1, vE_2_merged.advance(valuesPerIndex-1)); + } + + for (int i = 0; i < valuesPerIndex; i++) { + assertEquals(msg, i, vE_1.nextDoc()); + assertEquals(msg, i, vE_1_merged.nextDoc()); + + assertEquals(msg, i, vE_2.nextDoc()); + assertEquals(msg, i + valuesPerIndex, vE_2_merged.nextDoc()); + } + assertEquals(msg, ValuesEnum.NO_MORE_DOCS, vE_1.nextDoc()); + assertEquals(msg, ValuesEnum.NO_MORE_DOCS, vE_2.nextDoc()); + assertEquals(msg, ValuesEnum.NO_MORE_DOCS, vE_1_merged.advance(valuesPerIndex*2)); + assertEquals(msg, ValuesEnum.NO_MORE_DOCS, vE_2_merged.nextDoc()); + + // close resources + r_1.close(); + r_2.close(); + merged.close(); + w_1.close(true); + w_2.close(true); + w.close(true); + d_1.close(); + d_2.close(); + target.close(); + } + + private IndexWriterConfig writerConfig(boolean useCompoundFile) { + final IndexWriterConfig cfg = newIndexWriterConfig(TEST_VERSION_CURRENT, + new MockAnalyzer(random)); + cfg.setMergePolicy(newLogMergePolicy(random)); + LogMergePolicy policy = new LogDocMergePolicy(); + cfg.setMergePolicy(policy); + policy.setUseCompoundFile(useCompoundFile); + return cfg; + } + + public void runTestNumerics(IndexWriterConfig cfg, boolean withDeletions) + throws IOException { + Directory d = newDirectory(); + IndexWriter w = new IndexWriter(d, cfg); + final int numValues = 179 + random.nextInt(151); + final List numVariantList = new ArrayList(NUMERICS); + + // run in random order to test if fill works correctly during merges + Collections.shuffle(numVariantList, random); + for (ValueType val : numVariantList) { + OpenBitSet deleted = indexValues(w, numValues, val, numVariantList, + withDeletions, 7); + List closeables = new ArrayList(); + IndexReader r = IndexReader.open(w, true); + final int numRemainingValues = (int) (numValues - deleted.cardinality()); + final int base = r.numDocs() - numRemainingValues; + switch (val) { + case INTS: { + IndexDocValues intsReader = getDocValues(r, val.name()); + assertNotNull(intsReader); + + Source ints = getSource(intsReader); + + for (int i = 0; i < base; i++) { + long value = ints.getInt(i); + assertEquals("index " + i, 0, value); + } + + ValuesEnum intsEnum = getValuesEnum(intsReader); + assertTrue(intsEnum.advance(base) >= base); + + intsEnum = getValuesEnum(intsReader); + LongsRef enumRef = intsEnum.getInt(); + + int expected = 0; + for (int i = base; i < r.numDocs(); 
i++, expected++) { + while (deleted.get(expected)) { + expected++; + } + assertEquals("advance failed at index: " + i + " of " + r.numDocs() + + " docs", i, intsEnum.advance(i)); + assertEquals(expected, ints.getInt(i)); + assertEquals(expected, enumRef.get()); + + } + } + break; + case FLOAT_32: + case FLOAT_64: { + IndexDocValues floatReader = getDocValues(r, val.name()); + assertNotNull(floatReader); + Source floats = getSource(floatReader); + for (int i = 0; i < base; i++) { + double value = floats.getFloat(i); + assertEquals(val + " failed for doc: " + i + " base: " + base, + 0.0d, value, 0.0d); + } + ValuesEnum floatEnum = getValuesEnum(floatReader); + assertTrue(floatEnum.advance(base) >= base); + + floatEnum = getValuesEnum(floatReader); + FloatsRef enumRef = floatEnum.getFloat(); + int expected = 0; + for (int i = base; i < r.numDocs(); i++, expected++) { + while (deleted.get(expected)) { + expected++; + } + assertEquals("advance failed at index: " + i + " of " + r.numDocs() + + " docs base:" + base, i, floatEnum.advance(i)); + assertEquals(floatEnum.getClass() + " index " + i, 2.0 * expected, + enumRef.get(), 0.00001); + assertEquals("index " + i, 2.0 * expected, floats.getFloat(i), + 0.00001); + } + } + break; + default: + fail("unexpected value " + val); + } + + closeables.add(r); + for (Closeable toClose : closeables) { + toClose.close(); + } + } + w.close(); + d.close(); + } + + public void runTestIndexBytes(IndexWriterConfig cfg, boolean withDeletions) + throws CorruptIndexException, LockObtainFailedException, IOException { + final Directory d = newDirectory(); + IndexWriter w = new IndexWriter(d, cfg); + final List byteVariantList = new ArrayList(BYTES); + // run in random order to test if fill works correctly during merges + Collections.shuffle(byteVariantList, random); + final int numValues = 179 + random.nextInt(151); + for (ValueType byteIndexValue : byteVariantList) { + List closeables = new ArrayList(); + + int bytesSize = 7 + random.nextInt(128); + OpenBitSet deleted = indexValues(w, numValues, byteIndexValue, + byteVariantList, withDeletions, bytesSize); + final IndexReader r = IndexReader.open(w, withDeletions); + assertEquals(0, r.numDeletedDocs()); + final int numRemainingValues = (int) (numValues - deleted.cardinality()); + final int base = r.numDocs() - numRemainingValues; + IndexDocValues bytesReader = getDocValues(r, byteIndexValue.name()); + assertNotNull("field " + byteIndexValue.name() + + " returned null reader - maybe merged failed", bytesReader); + Source bytes = getSource(bytesReader); + byte upto = 0; + + // test the filled up slots for correctness + for (int i = 0; i < base; i++) { + + BytesRef br = bytes.getBytes(i, new BytesRef()); + String msg = " field: " + byteIndexValue.name() + " at index: " + i + + " base: " + base + " numDocs:" + r.numDocs(); + switch (byteIndexValue) { + case BYTES_VAR_STRAIGHT: + case BYTES_FIXED_STRAIGHT: + // fixed straight returns bytesref with zero bytes all of fixed + // length + assertNotNull("expected none null - " + msg, br); + if (br.length != 0) { + assertEquals("expected zero bytes of length " + bytesSize + " - " + + msg, bytesSize, br.length); + for (int j = 0; j < br.length; j++) { + assertEquals("Byte at index " + j + " doesn't match - " + msg, 0, + br.bytes[br.offset + j]); + } + } + break; + case BYTES_VAR_SORTED: + case BYTES_FIXED_SORTED: + case BYTES_VAR_DEREF: + case BYTES_FIXED_DEREF: + default: + assertNotNull("expected none null - " + msg, br); + assertEquals(0, br.length); + // make sure we advance 
at least until base + ValuesEnum bytesEnum = getValuesEnum(bytesReader); + try { + + final int advancedTo = bytesEnum.advance(0); + assertTrue(byteIndexValue.name() + " advanced failed base:" + base + + " advancedTo: " + advancedTo, base <= advancedTo); + }catch(Throwable e) { + final int advancedTo = bytesEnum.advance(0); + assertTrue(byteIndexValue.name() + " advanced failed base:" + base + + " advancedTo: " + advancedTo, base <= advancedTo); + + } + } + } + + ValuesEnum bytesEnum = getValuesEnum(bytesReader); + final BytesRef enumRef = bytesEnum.bytes(); + // test the actual doc values added in this iteration + assertEquals(base + numRemainingValues, r.numDocs()); + int v = 0; + for (int i = base; i < r.numDocs(); i++) { + String msg = " field: " + byteIndexValue.name() + " at index: " + i + + " base: " + base + " numDocs:" + r.numDocs() + " bytesSize: " + + bytesSize + " src: " + bytes; + while (withDeletions && deleted.get(v++)) { + upto += bytesSize; + } + + BytesRef br = bytes.getBytes(i, new BytesRef()); + if (bytesEnum.docID() != i) { + assertEquals("seek failed for index " + i + " " + msg, i, bytesEnum + .advance(i)); + } + for (int j = 0; j < br.length; j++, upto++) { + assertTrue(" enumRef not initialized " + msg, + enumRef.bytes.length > 0); + assertEquals( + "EnumRef Byte at index " + j + " doesn't match - " + msg, upto, + enumRef.bytes[enumRef.offset + j]); + if (!(br.bytes.length > br.offset + j)) + br = bytes.getBytes(i, new BytesRef()); + assertTrue("BytesRef index exceeded [" + msg + "] offset: " + + br.offset + " length: " + br.length + " index: " + + (br.offset + j), br.bytes.length > br.offset + j); + assertEquals("SourceRef Byte at index " + j + " doesn't match - " + + msg, upto, br.bytes[br.offset + j]); + } + } + + // clean up + closeables.add(r); + for (Closeable toClose : closeables) { + toClose.close(); + } + } + + w.close(); + d.close(); + } + + private IndexDocValues getDocValues(IndexReader reader, String field) + throws IOException { + boolean optimized = reader.isOptimized(); + PerDocValues perDoc = optimized ? reader.getSequentialSubReaders()[0].perDocValues() + : MultiPerDocValues.getPerDocs(reader); + switch (random.nextInt(optimized ? 3 : 2)) { // case 2 only if optimized + case 0: + return perDoc.docValues(field); + case 1: + IndexDocValues docValues = perDoc.docValues(field); + if (docValues != null) { + return docValues; + } + throw new RuntimeException("no such field " + field); + case 2:// this only works if we are on an optimized index! + return reader.getSequentialSubReaders()[0].docValues(field); + } + throw new RuntimeException(); + } + + private Source getSource(IndexDocValues values) throws IOException { + Source source; + if (random.nextInt(10) == 0) { + source = values.load(); + } else { + // getSource uses cache internally + source = values.getSource(); + } + assertNotNull(source); + return source; + } + + private ValuesEnum getValuesEnum(IndexDocValues values) throws IOException { + ValuesEnum valuesEnum; + if (!(values instanceof MultiIndexDocValues) && random.nextInt(10) == 0) { + // TODO not supported by MultiDocValues yet! 
+ valuesEnum = getSource(values).getEnum(); + } else { + valuesEnum = values.getEnum(); + + } + assertNotNull(valuesEnum); + return valuesEnum; + } + + private static EnumSet BYTES = EnumSet.of(ValueType.BYTES_FIXED_DEREF, + ValueType.BYTES_FIXED_SORTED, ValueType.BYTES_FIXED_STRAIGHT, ValueType.BYTES_VAR_DEREF, + ValueType.BYTES_VAR_SORTED, ValueType.BYTES_VAR_STRAIGHT); + + private static EnumSet NUMERICS = EnumSet.of(ValueType.INTS, + ValueType.FLOAT_32, ValueType.FLOAT_64); + + private static Index[] IDX_VALUES = new Index[] { Index.ANALYZED, + Index.ANALYZED_NO_NORMS, Index.NOT_ANALYZED, Index.NOT_ANALYZED_NO_NORMS, + Index.NO }; + + private OpenBitSet indexValues(IndexWriter w, int numValues, ValueType value, + List valueVarList, boolean withDeletions, int multOfSeven) + throws CorruptIndexException, IOException { + final boolean isNumeric = NUMERICS.contains(value); + OpenBitSet deleted = new OpenBitSet(numValues); + Document doc = new Document(); + Index idx = IDX_VALUES[random.nextInt(IDX_VALUES.length)]; + AbstractField field = random.nextBoolean() ? new IndexDocValuesField(value.name()) + : newField(value.name(), _TestUtil.randomRealisticUnicodeString(random, + 10), idx == Index.NO ? Store.YES : Store.NO, idx); + doc.add(field); + IndexDocValuesField valField = new IndexDocValuesField("prototype"); + final BytesRef bytesRef = new BytesRef(); + + final String idBase = value.name() + "_"; + final byte[] b = new byte[multOfSeven]; + if (bytesRef != null) { + bytesRef.bytes = b; + bytesRef.length = b.length; + bytesRef.offset = 0; + } + byte upto = 0; + for (int i = 0; i < numValues; i++) { + if (isNumeric) { + switch (value) { + case INTS: + valField.setInt(i); + break; + case FLOAT_32: + valField.setFloat(2.0f * i); + break; + case FLOAT_64: + valField.setFloat(2.0d * i); + break; + default: + fail("unexpected value " + value); + } + } else { + for (int j = 0; j < b.length; j++) { + b[j] = upto++; + } + if (bytesRef != null) { + valField.setBytes(bytesRef, value); + } + } + doc.removeFields("id"); + doc.add(new Field("id", idBase + i, Store.YES, + Index.NOT_ANALYZED_NO_NORMS)); + valField.set(field); + w.addDocument(doc); + + if (i % 7 == 0) { + if (withDeletions && random.nextBoolean()) { + ValueType val = valueVarList.get(random.nextInt(1 + valueVarList + .indexOf(value))); + final int randInt = val == value ? random.nextInt(1 + i) : random + .nextInt(numValues); + w.deleteDocuments(new Term("id", val.name() + "_" + randInt)); + if (val == value) { + deleted.set(randInt); + } + } + if (random.nextInt(10) == 0) { + w.commit(); + } + } + } + w.commit(); + + // TODO test unoptimized with deletions + if (withDeletions || random.nextBoolean()) + w.optimize(true); + return deleted; + } +} diff -ruN -x .svn -x build trunk_2/lucene/src/test/org/apache/lucene/search/TestDateFilter.java docvalues/lucene/src/test/org/apache/lucene/search/TestDateFilter.java --- trunk_2/lucene/src/test/org/apache/lucene/search/TestDateFilter.java 2011-02-28 11:48:55.000000000 +0100 +++ docvalues/lucene/src/test/org/apache/lucene/search/TestDateFilter.java 2011-05-17 16:46:27.000000000 +0200 @@ -32,7 +32,7 @@ * DateFilter JUnit tests. 
* * - * @version $Revision: 1075210 $ + * @version $Revision: 1086181 $ */ public class TestDateFilter extends LuceneTestCase { diff -ruN -x .svn -x build trunk_2/lucene/src/test/org/apache/lucene/search/TestDocBoost.java docvalues/lucene/src/test/org/apache/lucene/search/TestDocBoost.java --- trunk_2/lucene/src/test/org/apache/lucene/search/TestDocBoost.java 2011-05-28 09:04:43.000000000 +0200 +++ docvalues/lucene/src/test/org/apache/lucene/search/TestDocBoost.java 2011-05-17 16:46:27.000000000 +0200 @@ -31,7 +31,7 @@ /** Document boost unit test. * * - * @version $Revision: 1091132 $ + * @version $Revision: 1098566 $ */ public class TestDocBoost extends LuceneTestCase { diff -ruN -x .svn -x build trunk_2/lucene/src/test/org/apache/lucene/search/TestNot.java docvalues/lucene/src/test/org/apache/lucene/search/TestNot.java --- trunk_2/lucene/src/test/org/apache/lucene/search/TestNot.java 2011-05-28 09:04:43.000000000 +0200 +++ docvalues/lucene/src/test/org/apache/lucene/search/TestNot.java 2011-05-17 16:46:27.000000000 +0200 @@ -30,7 +30,7 @@ /** Similarity unit test. * * - * @version $Revision: 1091132 $ + * @version $Revision: 1098566 $ */ public class TestNot extends LuceneTestCase { diff -ruN -x .svn -x build trunk_2/lucene/src/test/org/apache/lucene/search/TestPositionIncrement.java docvalues/lucene/src/test/org/apache/lucene/search/TestPositionIncrement.java --- trunk_2/lucene/src/test/org/apache/lucene/search/TestPositionIncrement.java 2011-05-28 09:04:43.000000000 +0200 +++ docvalues/lucene/src/test/org/apache/lucene/search/TestPositionIncrement.java 2011-05-17 16:46:27.000000000 +0200 @@ -54,7 +54,7 @@ * Term position unit test. * * - * @version $Revision: 1091132 $ + * @version $Revision: 1098566 $ */ public class TestPositionIncrement extends LuceneTestCase { diff -ruN -x .svn -x build trunk_2/lucene/src/test/org/apache/lucene/search/TestSetNorm.java docvalues/lucene/src/test/org/apache/lucene/search/TestSetNorm.java --- trunk_2/lucene/src/test/org/apache/lucene/search/TestSetNorm.java 2011-05-28 09:04:43.000000000 +0200 +++ docvalues/lucene/src/test/org/apache/lucene/search/TestSetNorm.java 2011-05-17 16:46:27.000000000 +0200 @@ -31,7 +31,7 @@ /** Document boost unit test. * * - * @version $Revision: 1091132 $ + * @version $Revision: 1098566 $ */ public class TestSetNorm extends LuceneTestCase { diff -ruN -x .svn -x build trunk_2/lucene/src/test/org/apache/lucene/search/TestSimilarity.java docvalues/lucene/src/test/org/apache/lucene/search/TestSimilarity.java --- trunk_2/lucene/src/test/org/apache/lucene/search/TestSimilarity.java 2011-05-28 09:04:43.000000000 +0200 +++ docvalues/lucene/src/test/org/apache/lucene/search/TestSimilarity.java 2011-05-17 16:46:27.000000000 +0200 @@ -35,7 +35,7 @@ /** Similarity unit test. 
* * - * @version $Revision: 1091132 $ + * @version $Revision: 1098566 $ */ public class TestSimilarity extends LuceneTestCase { diff -ruN -x .svn -x build trunk_2/lucene/src/test/org/apache/lucene/search/TestSort.java docvalues/lucene/src/test/org/apache/lucene/search/TestSort.java --- trunk_2/lucene/src/test/org/apache/lucene/search/TestSort.java 2011-05-28 09:04:43.000000000 +0200 +++ docvalues/lucene/src/test/org/apache/lucene/search/TestSort.java 2011-06-03 22:43:04.000000000 +0200 @@ -25,6 +25,7 @@ import java.util.concurrent.TimeUnit; import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.IndexDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.CorruptIndexException; @@ -35,6 +36,8 @@ import org.apache.lucene.index.MultiReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.index.values.ValueType; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.FieldValueHitQueue.Entry; @@ -61,7 +64,8 @@ */ public class TestSort extends LuceneTestCase { - + // true if our codec supports docvalues: true unless codec is preflex (3.x) + boolean supportsDocValues = CodecProvider.getDefault().getDefaultFieldCodec().equals("PreFlex") == false; private static final int NUM_STRINGS = 6000 * RANDOM_MULTIPLIER; private IndexSearcher full; private IndexSearcher searchX; @@ -118,13 +122,28 @@ Document doc = new Document(); doc.add (new Field ("tracer", data[i][0], Field.Store.YES, Field.Index.NO)); doc.add (new Field ("contents", data[i][1], Field.Store.NO, Field.Index.ANALYZED)); - if (data[i][2] != null) doc.add (new Field ("int", data[i][2], Field.Store.NO, Field.Index.NOT_ANALYZED)); - if (data[i][3] != null) doc.add (new Field ("float", data[i][3], Field.Store.NO, Field.Index.NOT_ANALYZED)); + if (data[i][2] != null) { + Field f = supportsDocValues ? + IndexDocValuesField.set(new Field ("int", data[i][2], Field.Store.NO, Field.Index.NOT_ANALYZED), ValueType.INTS) + : new Field ("int", data[i][2], Field.Store.NO, Field.Index.NOT_ANALYZED); + doc.add(f); + } + if (data[i][3] != null) { + Field f = supportsDocValues ? + IndexDocValuesField.set(new Field ("float", data[i][3], Field.Store.NO, Field.Index.NOT_ANALYZED), ValueType.FLOAT_32) + : new Field ("float", data[i][3], Field.Store.NO, Field.Index.NOT_ANALYZED); + doc.add(f); + } if (data[i][4] != null) doc.add (new Field ("string", data[i][4], Field.Store.NO, Field.Index.NOT_ANALYZED)); if (data[i][5] != null) doc.add (new Field ("custom", data[i][5], Field.Store.NO, Field.Index.NOT_ANALYZED)); if (data[i][6] != null) doc.add (new Field ("i18n", data[i][6], Field.Store.NO, Field.Index.NOT_ANALYZED)); if (data[i][7] != null) doc.add (new Field ("long", data[i][7], Field.Store.NO, Field.Index.NOT_ANALYZED)); - if (data[i][8] != null) doc.add (new Field ("double", data[i][8], Field.Store.NO, Field.Index.NOT_ANALYZED)); + if (data[i][8] != null) { + Field f = supportsDocValues ? 
+ IndexDocValuesField.set(new Field ("double", data[i][8], Field.Store.NO, Field.Index.NOT_ANALYZED), ValueType.FLOAT_64) + : new Field ("double", data[i][8], Field.Store.NO, Field.Index.NOT_ANALYZED); + doc.add(f); + } if (data[i][9] != null) doc.add (new Field ("short", data[i][9], Field.Store.NO, Field.Index.NOT_ANALYZED)); if (data[i][10] != null) doc.add (new Field ("byte", data[i][10], Field.Store.NO, Field.Index.NOT_ANALYZED)); if (data[i][11] != null) doc.add (new Field ("parser", data[i][11], Field.Store.NO, Field.Index.NOT_ANALYZED)); @@ -217,6 +236,7 @@ @Override public void setUp() throws Exception { super.setUp(); + full = getFullIndex(); searchX = getXIndex(); searchY = getYIndex(); @@ -228,6 +248,7 @@ queryG = new TermQuery (new Term ("contents", "g")); queryM = new TermQuery (new Term ("contents", "m")); sort = new Sort(); + } private ArrayList dirs = new ArrayList(); @@ -256,12 +277,16 @@ assertMatches (full, queryY, sort, "BDFHJ"); } + private static SortField useDocValues(SortField field) { + field.setUseIndexValues(true); + return field; + } // test sorts where the type of field is specified public void testTypedSort() throws Exception { sort.setSort (new SortField ("int", SortField.INT), SortField.FIELD_DOC ); assertMatches (full, queryX, sort, "IGAEC"); assertMatches (full, queryY, sort, "DHFJB"); - + sort.setSort (new SortField ("float", SortField.FLOAT), SortField.FIELD_DOC ); assertMatches (full, queryX, sort, "GCIEA"); assertMatches (full, queryY, sort, "DHJFB"); @@ -273,7 +298,7 @@ sort.setSort (new SortField ("double", SortField.DOUBLE), SortField.FIELD_DOC ); assertMatches (full, queryX, sort, "AGICE"); assertMatches (full, queryY, sort, "DJHBF"); - + sort.setSort (new SortField ("byte", SortField.BYTE), SortField.FIELD_DOC ); assertMatches (full, queryX, sort, "CIGAE"); assertMatches (full, queryY, sort, "DHFBJ"); @@ -285,6 +310,20 @@ sort.setSort (new SortField ("string", SortField.STRING), SortField.FIELD_DOC ); assertMatches (full, queryX, sort, "AIGEC"); assertMatches (full, queryY, sort, "DJHFB"); + + if (supportsDocValues) { + sort.setSort (useDocValues(new SortField ("int", SortField.INT)), SortField.FIELD_DOC ); + assertMatches (full, queryX, sort, "IGAEC"); + assertMatches (full, queryY, sort, "DHFJB"); + + sort.setSort (useDocValues(new SortField ("float", SortField.FLOAT)), SortField.FIELD_DOC ); + assertMatches (full, queryX, sort, "GCIEA"); + assertMatches (full, queryY, sort, "DHJFB"); + + sort.setSort (useDocValues(new SortField ("double", SortField.DOUBLE)), SortField.FIELD_DOC ); + assertMatches (full, queryX, sort, "AGICE"); + assertMatches (full, queryY, sort, "DJHBF"); + } } private static class SortMissingLastTestHelper { @@ -458,12 +497,18 @@ sort.setSort (new SortField ("int", SortField.INT), SortField.FIELD_DOC ); assertMatches (empty, queryX, sort, ""); + + sort.setSort (useDocValues(new SortField ("int", SortField.INT)), SortField.FIELD_DOC ); + assertMatches (empty, queryX, sort, ""); sort.setSort (new SortField ("string", SortField.STRING, true), SortField.FIELD_DOC ); assertMatches (empty, queryX, sort, ""); sort.setSort (new SortField ("float", SortField.FLOAT), new SortField ("string", SortField.STRING) ); assertMatches (empty, queryX, sort, ""); + + sort.setSort (useDocValues(new SortField ("float", SortField.FLOAT)), new SortField ("string", SortField.STRING) ); + assertMatches (empty, queryX, sort, ""); } static class MyFieldComparator extends FieldComparator { @@ -543,10 +588,20 @@ sort.setSort (new SortField ("float", 
SortField.FLOAT, true) ); assertMatches (full, queryX, sort, "AECIG"); assertMatches (full, queryY, sort, "BFJHD"); - + sort.setSort (new SortField ("string", SortField.STRING, true) ); assertMatches (full, queryX, sort, "CEGIA"); assertMatches (full, queryY, sort, "BFHJD"); + + if (supportsDocValues) { + sort.setSort (useDocValues(new SortField ("int", SortField.INT, true)) ); + assertMatches (full, queryX, sort, "CAEGI"); + assertMatches (full, queryY, sort, "BJFHD"); + + sort.setSort (useDocValues(new SortField ("float", SortField.FLOAT, true)) ); + assertMatches (full, queryX, sort, "AECIG"); + assertMatches (full, queryY, sort, "BFJHD"); + } } // test sorting when the sort field is empty (undefined) for some of the documents @@ -566,6 +621,14 @@ sort.setSort (new SortField ("float", SortField.FLOAT) ); assertMatches (full, queryF, sort, "ZJI"); + if (supportsDocValues) { + sort.setSort (useDocValues(new SortField ("int", SortField.INT)) ); + assertMatches (full, queryF, sort, "IZJ"); + + sort.setSort (useDocValues(new SortField ("float", SortField.FLOAT)) ); + assertMatches (full, queryF, sort, "ZJI"); + } + // using a nonexisting field as first sort key shouldn't make a difference: sort.setSort (new SortField ("nosuchfield", SortField.STRING), new SortField ("float", SortField.FLOAT) ); @@ -887,7 +950,7 @@ sort.setSort(new SortField("int", SortField.INT)); expected = isFull ? "IDHFGJABEC" : "IDHFGJAEBC"; assertMatches(multi, queryA, sort, expected); - + sort.setSort(new SortField ("float", SortField.FLOAT), SortField.FIELD_DOC); assertMatches(multi, queryA, sort, "GDHJCIEFAB"); @@ -928,6 +991,39 @@ sort.setSort(new SortField ("string", SortField.STRING, true)); assertMatches(multi, queryF, sort, "IJZ"); + if (supportsDocValues) { + sort.setSort(useDocValues(new SortField ("int", SortField.INT))); + expected = isFull ? "IDHFGJABEC" : "IDHFGJAEBC"; + assertMatches(multi, queryA, sort, expected); + + sort.setSort(useDocValues(new SortField ("int", SortField.INT)), SortField.FIELD_DOC); + expected = isFull ? "IDHFGJABEC" : "IDHFGJAEBC"; + assertMatches(multi, queryA, sort, expected); + + sort.setSort(useDocValues(new SortField("int", SortField.INT))); + expected = isFull ? "IDHFGJABEC" : "IDHFGJAEBC"; + assertMatches(multi, queryA, sort, expected); + + sort.setSort(useDocValues(new SortField ("float", SortField.FLOAT)), SortField.FIELD_DOC); + assertMatches(multi, queryA, sort, "GDHJCIEFAB"); + + sort.setSort(useDocValues(new SortField("float", SortField.FLOAT))); + assertMatches(multi, queryA, sort, "GDHJCIEFAB"); + + sort.setSort(useDocValues(new SortField("int", SortField.INT, true))); + expected = isFull ? 
"CABEJGFHDI" : "CAEBJGFHDI"; + assertMatches(multi, queryA, sort, expected); + + sort.setSort(useDocValues(new SortField("int", SortField.INT)), useDocValues(new SortField("float", SortField.FLOAT))); + assertMatches(multi, queryA, sort, "IDHFGJEABC"); + + sort.setSort(useDocValues(new SortField ("int", SortField.INT))); + assertMatches(multi, queryF, sort, "IZJ"); + + sort.setSort(useDocValues(new SortField ("int", SortField.INT, true))); + assertMatches(multi, queryF, sort, "JZI"); + } + // up to this point, all of the searches should have "sane" // FieldCache behavior, and should have reused the cache in several cases assertSaneFieldCaches(getName() + " various"); diff -ruN -x .svn -x build trunk_2/lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java docvalues/lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java --- trunk_2/lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java 2011-05-28 09:05:03.000000000 +0200 +++ docvalues/lucene/src/test-framework/org/apache/lucene/index/RandomIndexWriter.java 2011-06-03 22:43:06.000000000 +0200 @@ -24,9 +24,13 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.IndexDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexWriter; // javadoc +import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.index.values.ValueType; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.Version; import org.apache.lucene.util._TestUtil; @@ -45,6 +49,10 @@ int flushAt; private double flushAtFactor = 1.0; private boolean getReaderCalled; + private final int fixedBytesLength; + private final long docValuesFieldPrefix; + private volatile boolean doDocValues; + private CodecProvider codecProvider; // Randomly calls Thread.yield so we mixup thread scheduling private static final class MockIndexWriter extends IndexWriter { @@ -92,13 +100,32 @@ System.out.println("codec default=" + w.getConfig().getCodecProvider().getDefaultFieldCodec()); w.setInfoStream(System.out); } + /* TODO: find some way to make that random... + * This must be fixed across all fixed bytes + * fields in one index. so if you open another writer + * this might change if I use r.nextInt(x) + * maybe we can peek at the existing files here? + */ + fixedBytesLength = 37; + docValuesFieldPrefix = r.nextLong(); + codecProvider = w.getConfig().getCodecProvider(); + switchDoDocValues(); } + private void switchDoDocValues() { + // randomly enable / disable docValues + doDocValues = r.nextInt(10) != 0; + } + /** * Adds a Document. 
* @see IndexWriter#addDocument(Document) */ public void addDocument(final Document doc) throws IOException { + if (doDocValues) { + randomPerDocFieldValues(r, doc); + } + if (r.nextInt(5) == 3) { // TODO: maybe, we should simply buffer up added docs // (but we need to clone them), and only when @@ -135,8 +162,53 @@ } else { w.addDocument(doc); } + maybeCommit(); } + + private void randomPerDocFieldValues(Random random, Document doc) { + + ValueType[] values = ValueType.values(); + ValueType type = values[random.nextInt(values.length)]; + String name = "random_" + type.name() + "" + docValuesFieldPrefix; + if ("PreFlex".equals(codecProvider.getFieldCodec(name)) || doc.getFieldable(name) != null) + return; + IndexDocValuesField docValuesField = new IndexDocValuesField(name); + switch (type) { + case BYTES_FIXED_DEREF: + case BYTES_FIXED_SORTED: + case BYTES_FIXED_STRAIGHT: + final String randomUnicodeString = _TestUtil.randomUnicodeString(random, fixedBytesLength); + BytesRef fixedRef = new BytesRef(randomUnicodeString); + if (fixedRef.length > fixedBytesLength) { + fixedRef = new BytesRef(fixedRef.bytes, 0, fixedBytesLength); + } else { + fixedRef.grow(fixedBytesLength); + fixedRef.length = fixedBytesLength; + } + docValuesField.setBytes(fixedRef, type); + break; + case BYTES_VAR_DEREF: + case BYTES_VAR_SORTED: + case BYTES_VAR_STRAIGHT: + BytesRef ref = new BytesRef(_TestUtil.randomUnicodeString(random, 200)); + docValuesField.setBytes(ref, type); + break; + case FLOAT_32: + docValuesField.setFloat(random.nextFloat()); + break; + case FLOAT_64: + docValuesField.setFloat(random.nextDouble()); + break; + case INTS: + docValuesField.setInt(random.nextInt()); + break; + default: + throw new IllegalArgumentException("no such type: " + type); + } + + doc.add(docValuesField); + } private void maybeCommit() throws IOException { if (docCount++ == flushAt) { @@ -149,6 +221,7 @@ // gradually but exponentially increase time b/w flushes flushAtFactor *= 1.05; } + switchDoDocValues(); } } @@ -166,7 +239,11 @@ * Updates a document. 
* @see IndexWriter#updateDocument(Term, Document) */ - public void updateDocument(Term t, final Document doc) throws IOException { + public void updateDocument(final Term t, final Document doc) throws IOException { + if (doDocValues) { + randomPerDocFieldValues(r, doc); + } + if (r.nextInt(5) == 3) { w.updateDocuments(t, new Iterable() { @@ -212,6 +289,7 @@ public void commit() throws CorruptIndexException, IOException { w.commit(); + switchDoDocValues(); } public int numDocs() throws IOException { @@ -241,6 +319,7 @@ w.optimize(limit); assert w.getSegmentCount() <= limit: "limit=" + limit + " actual=" + w.getSegmentCount(); } + switchDoDocValues(); } public IndexReader getReader(boolean applyDeletions) throws IOException { @@ -261,6 +340,7 @@ System.out.println("RIW.getReader: open new reader"); } w.commit(); + switchDoDocValues(); return IndexReader.open(w.getDirectory(), new KeepOnlyLastCommitDeletionPolicy(), r.nextBoolean(), _TestUtil.nextInt(r, 1, 10), w.getConfig().getCodecProvider()); } } diff -ruN -x .svn -x build trunk_2/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java docvalues/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java --- trunk_2/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java 2011-05-29 22:19:09.000000000 +0200 +++ docvalues/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java 2011-06-03 22:43:06.000000000 +0200 @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.Set; +import org.apache.lucene.index.PerDocWriteState; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentReadState; @@ -33,8 +34,13 @@ import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl; import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexInput; import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexOutput; +import org.apache.lucene.index.codecs.DocValuesConsumer; +import org.apache.lucene.index.codecs.DefaultDocValuesProducer; import org.apache.lucene.index.codecs.FixedGapTermsIndexReader; import org.apache.lucene.index.codecs.FixedGapTermsIndexWriter; +import org.apache.lucene.index.codecs.PerDocConsumer; +import org.apache.lucene.index.codecs.DefaultDocValuesConsumer; +import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.index.codecs.PostingsWriterBase; import org.apache.lucene.index.codecs.PostingsReaderBase; import org.apache.lucene.index.codecs.BlockTermsReader; @@ -197,10 +203,12 @@ } @Override - public void files(Directory dir, SegmentInfo segmentInfo, String codecId, Set files) throws IOException { - SepPostingsReaderImpl.files(segmentInfo, codecId, files); - BlockTermsReader.files(dir, segmentInfo, codecId, files); - FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files); + public void files(Directory dir, SegmentInfo segmentInfo, int codecId, Set files) throws IOException { + final String codecIdAsString = "" + codecId; + SepPostingsReaderImpl.files(segmentInfo, codecIdAsString, files); + BlockTermsReader.files(dir, segmentInfo, codecIdAsString, files); + FixedGapTermsIndexReader.files(dir, segmentInfo, codecIdAsString, files); + DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files); } @Override @@ -208,5 +216,16 @@ SepPostingsWriterImpl.getExtensions(extensions); BlockTermsReader.getExtensions(extensions); 
FixedGapTermsIndexReader.getIndexExtensions(extensions); + DefaultDocValuesConsumer.getDocValuesExtensions(extensions); + } + + @Override + public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { + return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); + } + + @Override + public PerDocValues docsProducer(SegmentReadState state) throws IOException { + return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId); } } diff -ruN -x .svn -x build trunk_2/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java docvalues/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java --- trunk_2/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java 2011-05-29 22:19:09.000000000 +0200 +++ docvalues/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java 2011-06-03 22:43:06.000000000 +0200 @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.Set; +import org.apache.lucene.index.PerDocWriteState; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentReadState; @@ -33,8 +34,13 @@ import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl; import org.apache.lucene.index.codecs.intblock.VariableIntBlockIndexInput; import org.apache.lucene.index.codecs.intblock.VariableIntBlockIndexOutput; +import org.apache.lucene.index.codecs.DocValuesConsumer; +import org.apache.lucene.index.codecs.DefaultDocValuesProducer; import org.apache.lucene.index.codecs.FixedGapTermsIndexReader; import org.apache.lucene.index.codecs.FixedGapTermsIndexWriter; +import org.apache.lucene.index.codecs.PerDocConsumer; +import org.apache.lucene.index.codecs.DefaultDocValuesConsumer; +import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.index.codecs.PostingsWriterBase; import org.apache.lucene.index.codecs.PostingsReaderBase; import org.apache.lucene.index.codecs.BlockTermsReader; @@ -220,10 +226,12 @@ } @Override - public void files(Directory dir, SegmentInfo segmentInfo, String codecId, Set files) throws IOException { - SepPostingsReaderImpl.files(segmentInfo, codecId, files); - BlockTermsReader.files(dir, segmentInfo, codecId, files); - FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files); + public void files(Directory dir, SegmentInfo segmentInfo, int codecId, Set files) throws IOException { + final String codecIdAsString = "" + codecId; + SepPostingsReaderImpl.files(segmentInfo, codecIdAsString, files); + BlockTermsReader.files(dir, segmentInfo, codecIdAsString, files); + FixedGapTermsIndexReader.files(dir, segmentInfo, codecIdAsString, files); + DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files); } @Override @@ -231,5 +239,16 @@ SepPostingsWriterImpl.getExtensions(extensions); BlockTermsReader.getExtensions(extensions); FixedGapTermsIndexReader.getIndexExtensions(extensions); + DefaultDocValuesConsumer.getDocValuesExtensions(extensions); + } + + @Override + public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { + return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); + } + + @Override + public PerDocValues docsProducer(SegmentReadState state) throws IOException { + return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId); } 
} diff -ruN -x .svn -x build trunk_2/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java docvalues/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java --- trunk_2/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java 2011-05-29 22:19:09.000000000 +0200 +++ docvalues/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java 2011-06-03 22:43:06.000000000 +0200 @@ -26,16 +26,22 @@ import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.PerDocWriteState; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.codecs.BlockTermsReader; import org.apache.lucene.index.codecs.BlockTermsWriter; import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.DocValuesConsumer; +import org.apache.lucene.index.codecs.DefaultDocValuesProducer; import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.FieldsProducer; import org.apache.lucene.index.codecs.FixedGapTermsIndexReader; import org.apache.lucene.index.codecs.FixedGapTermsIndexWriter; +import org.apache.lucene.index.codecs.PerDocConsumer; +import org.apache.lucene.index.codecs.DefaultDocValuesConsumer; +import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.index.codecs.PostingsReaderBase; import org.apache.lucene.index.codecs.PostingsWriterBase; import org.apache.lucene.index.codecs.TermStats; @@ -134,7 +140,7 @@ System.out.println("MockRandomCodec: writing to seg=" + state.segmentName + " seed=" + seed); } - final String seedFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, SEED_EXT); + final String seedFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecIdAsString(), SEED_EXT); final IndexOutput out = state.directory.createOutput(seedFileName); try { out.writeLong(seed); @@ -235,7 +241,7 @@ @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - final String seedFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.codecId, SEED_EXT); + final String seedFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.codecIdAsString(), SEED_EXT); final IndexInput in = state.dir.openInput(seedFileName); final long seed = in.readLong(); if (LuceneTestCase.VERBOSE) { @@ -341,15 +347,16 @@ } @Override - public void files(Directory dir, SegmentInfo segmentInfo, String codecId, Set files) throws IOException { - final String seedFileName = IndexFileNames.segmentFileName(segmentInfo.name, codecId, SEED_EXT); + public void files(Directory dir, SegmentInfo segmentInfo, int codecId, Set files) throws IOException { + final String codecIdAsString = codecId + ""; + final String seedFileName = IndexFileNames.segmentFileName(segmentInfo.name, codecIdAsString, SEED_EXT); files.add(seedFileName); - SepPostingsReaderImpl.files(segmentInfo, codecId, files); - StandardPostingsReader.files(dir, segmentInfo, codecId, files); - BlockTermsReader.files(dir, segmentInfo, codecId, files); - FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files); - VariableGapTermsIndexReader.files(dir, segmentInfo, codecId, files); - + SepPostingsReaderImpl.files(segmentInfo, codecIdAsString, files); + StandardPostingsReader.files(dir, segmentInfo, codecIdAsString, files); + 
BlockTermsReader.files(dir, segmentInfo, codecIdAsString, files); + FixedGapTermsIndexReader.files(dir, segmentInfo, codecIdAsString, files); + VariableGapTermsIndexReader.files(dir, segmentInfo, codecIdAsString, files); + DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files); // hackish! Iterator it = files.iterator(); while(it.hasNext()) { @@ -367,7 +374,19 @@ BlockTermsReader.getExtensions(extensions); FixedGapTermsIndexReader.getIndexExtensions(extensions); VariableGapTermsIndexReader.getIndexExtensions(extensions); + DefaultDocValuesConsumer.getDocValuesExtensions(extensions); extensions.add(SEED_EXT); //System.out.println("MockRandom.getExtensions return " + extensions); } + + // can we make this more evil? + @Override + public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { + return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); + } + + @Override + public PerDocValues docsProducer(SegmentReadState state) throws IOException { + return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId); + } } diff -ruN -x .svn -x build trunk_2/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java docvalues/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java --- trunk_2/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java 2011-05-28 09:05:03.000000000 +0200 +++ docvalues/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java 2011-05-17 16:46:46.000000000 +0200 @@ -20,14 +20,19 @@ import java.io.IOException; import java.util.Set; +import org.apache.lucene.index.PerDocWriteState; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.DefaultDocValuesProducer; import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.FieldsProducer; import org.apache.lucene.index.codecs.FixedGapTermsIndexReader; import org.apache.lucene.index.codecs.FixedGapTermsIndexWriter; +import org.apache.lucene.index.codecs.PerDocConsumer; +import org.apache.lucene.index.codecs.DefaultDocValuesConsumer; +import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.index.codecs.PostingsReaderBase; import org.apache.lucene.index.codecs.PostingsWriterBase; import org.apache.lucene.index.codecs.BlockTermsReader; @@ -130,15 +135,18 @@ } @Override - public void files(Directory dir, SegmentInfo segmentInfo, String codecId, Set files) throws IOException { - SepPostingsReaderImpl.files(segmentInfo, codecId, files); - BlockTermsReader.files(dir, segmentInfo, codecId, files); - FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files); + public void files(Directory dir, SegmentInfo segmentInfo, int codecId, Set files) throws IOException { + final String codecIdAsString = "" + codecId; + SepPostingsReaderImpl.files(segmentInfo, codecIdAsString, files); + BlockTermsReader.files(dir, segmentInfo, codecIdAsString, files); + FixedGapTermsIndexReader.files(dir, segmentInfo, codecIdAsString, files); + DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files); } @Override public void getExtensions(Set extensions) { getSepExtensions(extensions); + DefaultDocValuesConsumer.getDocValuesExtensions(extensions); } public static void getSepExtensions(Set extensions) { @@ -146,4 +154,14 @@ 
BlockTermsReader.getExtensions(extensions); FixedGapTermsIndexReader.getIndexExtensions(extensions); } + + @Override + public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { + return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); + } + + @Override + public PerDocValues docsProducer(SegmentReadState state) throws IOException { + return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId); + } } diff -ruN -x .svn -x build trunk_2/lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java docvalues/lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java --- trunk_2/lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java 2011-05-29 22:19:09.000000000 +0200 +++ docvalues/lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java 2011-06-03 22:43:07.000000000 +0200 @@ -286,6 +286,48 @@ sb.appendCodePoint(nextInt(r, blockStarts[block], blockEnds[block])); return sb.toString(); } + + /** Returns random string, with a given UTF-8 byte length*/ + public static String randomFixedByteLengthUnicodeString(Random r, int length) { + + final char[] buffer = new char[length*3]; + int bytes = length; + int i = 0; + for (; i < buffer.length && bytes != 0; i++) { + int t; + if (bytes >= 4) { + t = r.nextInt(5); + } else if (bytes >= 3) { + t = r.nextInt(4); + } else if (bytes >= 2) { + t = r.nextInt(2); + } else { + t = 0; + } + if (t == 0) { + buffer[i] = (char) r.nextInt(0x80); + bytes--; + } else if (1 == t) { + buffer[i] = (char) nextInt(r, 0x80, 0x7ff); + bytes -= 2; + } else if (2 == t) { + buffer[i] = (char) nextInt(r, 0x800, 0xd7ff); + bytes -= 3; + } else if (3 == t) { + buffer[i] = (char) nextInt(r, 0xe000, 0xffff); + bytes -= 3; + } else if (4 == t) { + // Make a surrogate pair + // High surrogate + buffer[i++] = (char) nextInt(r, 0xd800, 0xdbff); + // Low surrogate + buffer[i] = (char) nextInt(r, 0xdc00, 0xdfff); + bytes -= 4; + } + + } + return new String(buffer, 0, i); + } public static CodecProvider alwaysCodec(final Codec c) { CodecProvider p = new CodecProvider() { @@ -370,7 +412,7 @@ List fields = doc.getFields(); for (Fieldable field : fields) { fieldInfos.addOrUpdate(field.name(), field.isIndexed(), field.isTermVectorStored(), field.isStorePositionWithTermVector(), - field.isStoreOffsetWithTermVector(), field.getOmitNorms(), false, field.getOmitTermFreqAndPositions()); + field.isStoreOffsetWithTermVector(), field.getOmitNorms(), false, field.getOmitTermFreqAndPositions(), field.docValuesType()); } } diff -ruN -x .svn -x build trunk_2/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilter.java docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilter.java --- trunk_2/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilter.java 2011-05-28 09:04:20.000000000 +0200 +++ docvalues/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilter.java 2011-05-19 22:32:06.000000000 +0200 @@ -33,7 +33,7 @@ import org.apache.lucene.analysis.tokenattributes.*; /** - * @version $Id: TestSynonymFilter.java 1104519 2011-05-17 20:16:40Z rmuir $ + * @version $Id: TestSynonymFilter.java 1124321 2011-05-18 16:24:27Z simonw $ */ public class TestSynonymFilter extends BaseTokenStreamTestCase { diff -ruN -x .svn -x build trunk_2/solr/CHANGES.txt docvalues/solr/CHANGES.txt --- trunk_2/solr/CHANGES.txt 2011-06-04 00:09:29.000000000 +0200 +++ docvalues/solr/CHANGES.txt 
2011-06-04 00:40:44.000000000 +0200 @@ -19,7 +19,7 @@ See the tutorial at http://lucene.apache.org/solr/tutorial.html -$Id: CHANGES.txt 1131228 2011-06-03 20:48:47Z yonik $ +$Id: CHANGES.txt 1131275 2011-06-03 22:40:42Z simonw $ ================== 4.0.0-dev ================== Versions of Major Components diff -ruN -x .svn -x build trunk_2/solr/contrib/dataimporthandler/CHANGES.txt docvalues/solr/contrib/dataimporthandler/CHANGES.txt --- trunk_2/solr/contrib/dataimporthandler/CHANGES.txt 2011-06-04 00:09:16.000000000 +0200 +++ docvalues/solr/contrib/dataimporthandler/CHANGES.txt 2011-06-03 22:43:25.000000000 +0200 @@ -7,7 +7,7 @@ HTTP data sources quick and easy. -$Id: CHANGES.txt 1129427 2011-05-30 23:11:10Z rmuir $ +$Id: CHANGES.txt 1129631 2011-05-31 11:25:37Z simonw $ ================== 4.0.0-dev ============== (No Changes) diff -ruN -x .svn -x build trunk_2/solr/contrib/dataimporthandler/README.txt docvalues/solr/contrib/dataimporthandler/README.txt --- trunk_2/solr/contrib/dataimporthandler/README.txt 2011-02-01 17:08:15.000000000 +0100 +++ docvalues/solr/contrib/dataimporthandler/README.txt 2011-02-09 11:14:02.000000000 +0100 @@ -1,7 +1,215 @@ -Although Solr strives to be agnostic of the Locale where the server is -running, some code paths in DataImportHandler are known to depend on the -System default Locale, Timezone, or Charset. It is recommended that when -running Solr you set the following system properties: - -Duser.language=xx -Duser.country=YY -Duser.timezone=ZZZ +package org.apache.lucene.index.codecs.preflexrw; -where xx, YY, and ZZZ are consistent with any database server's configuration. +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.index.codecs.FieldsConsumer; +import org.apache.lucene.index.codecs.TermsConsumer; +import org.apache.lucene.index.codecs.PostingsConsumer; +import org.apache.lucene.index.codecs.standard.DefaultSkipListWriter; +import org.apache.lucene.index.codecs.preflex.PreFlexCodec; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.codecs.preflex.TermInfo; +import org.apache.lucene.index.codecs.docvalues.DocValuesConsumer; +import org.apache.lucene.store.IndexOutput; + +import java.io.IOException; +import java.util.Comparator; + +class PreFlexFieldsWriter extends FieldsConsumer { + + private final TermInfosWriter termsOut; + private final IndexOutput freqOut; + private final IndexOutput proxOut; + private final DefaultSkipListWriter skipListWriter; + private final int totalNumDocs; + + public PreFlexFieldsWriter(SegmentWriteState state) throws IOException { + termsOut = new TermInfosWriter(state.directory, + state.segmentName, + state.fieldInfos, + state.termIndexInterval); + + final String freqFile = IndexFileNames.segmentFileName(state.segmentName, "", PreFlexCodec.FREQ_EXTENSION); + freqOut = state.directory.createOutput(freqFile); + totalNumDocs = state.numDocs; + + if (state.fieldInfos.hasProx()) { + final String proxFile = IndexFileNames.segmentFileName(state.segmentName, "", PreFlexCodec.PROX_EXTENSION); + proxOut = state.directory.createOutput(proxFile); + } else { + proxOut = null; + } + + skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval, + termsOut.maxSkipLevels, + totalNumDocs, + freqOut, + proxOut); + //System.out.println("\nw start seg=" + segment); + } + + @Override + public TermsConsumer addField(FieldInfo field) throws IOException { + assert field.number != -1; + //System.out.println("w field=" + field.name + " storePayload=" + field.storePayloads + " number=" + field.number); + return new PreFlexTermsWriter(field); + } + + @Override + public void close() throws IOException { + termsOut.close(); + freqOut.close(); + if (proxOut != null) { + proxOut.close(); + } + } + + private class PreFlexTermsWriter extends TermsConsumer { + private final FieldInfo fieldInfo; + private final boolean omitTF; + private final boolean storePayloads; + + private final TermInfo termInfo = new TermInfo(); + private final PostingsWriter postingsWriter = new PostingsWriter(); + + public PreFlexTermsWriter(FieldInfo fieldInfo) { + this.fieldInfo = fieldInfo; + omitTF = fieldInfo.omitTermFreqAndPositions; + storePayloads = fieldInfo.storePayloads; + } + + private class PostingsWriter extends PostingsConsumer { + private int lastDocID; + private int lastPayloadLength = -1; + private int lastPosition; + private int df; + + public PostingsWriter reset() { + df = 0; + lastDocID = 0; + lastPayloadLength = -1; + return this; + } + + @Override + public void startDoc(int docID, int termDocFreq) throws IOException { + //System.out.println(" w doc=" + docID); + + final int delta = docID - lastDocID; + if (docID < 0 || (df > 0 && delta <= 0)) { + throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )"); + } + + if ((++df % termsOut.skipInterval) == 0) { + skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength); + skipListWriter.bufferSkip(df); + } + + lastDocID = docID; + + assert docID < totalNumDocs: 
"docID=" + docID + " totalNumDocs=" + totalNumDocs; + + if (omitTF) { + freqOut.writeVInt(delta); + } else { + final int code = delta << 1; + if (termDocFreq == 1) { + freqOut.writeVInt(code|1); + } else { + freqOut.writeVInt(code); + freqOut.writeVInt(termDocFreq); + } + } + lastPosition = 0; + } + + @Override + public void addPosition(int position, BytesRef payload) throws IOException { + assert proxOut != null; + + //System.out.println(" w pos=" + position + " payl=" + payload); + final int delta = position - lastPosition; + lastPosition = position; + + if (storePayloads) { + final int payloadLength = payload == null ? 0 : payload.length; + if (payloadLength != lastPayloadLength) { + //System.out.println(" write payload len=" + payloadLength); + lastPayloadLength = payloadLength; + proxOut.writeVInt((delta<<1)|1); + proxOut.writeVInt(payloadLength); + } else { + proxOut.writeVInt(delta << 1); + } + if (payloadLength > 0) { + proxOut.writeBytes(payload.bytes, payload.offset, payload.length); + } + } else { + proxOut.writeVInt(delta); + } + } + + @Override + public void finishDoc() throws IOException { + } + } + + @Override + public PostingsConsumer startTerm(BytesRef text) throws IOException { + //System.out.println(" w term=" + text.utf8ToString()); + skipListWriter.resetSkip(); + termInfo.freqPointer = freqOut.getFilePointer(); + if (proxOut != null) { + termInfo.proxPointer = proxOut.getFilePointer(); + } + return postingsWriter.reset(); + } + + @Override + public void finishTerm(BytesRef text, int numDocs) throws IOException { + if (numDocs > 0) { + long skipPointer = skipListWriter.writeSkip(freqOut); + termInfo.docFreq = numDocs; + termInfo.skipOffset = (int) (skipPointer - termInfo.freqPointer); + //System.out.println(" w finish term=" + text.utf8ToString() + " fnum=" + fieldInfo.number); + termsOut.add(fieldInfo.number, + text, + termInfo); + } + } + + @Override + public void finish() throws IOException { + } + + @Override + public Comparator getComparator() throws IOException { + return BytesRef.getUTF8SortedAsUTF16Comparator(); + } + } + + @Override + public DocValuesConsumer addValuesField(FieldInfo field) throws IOException { + //TODO(simonw): can we fix this easily? + throw new UnsupportedOperationException("not implemented"); + } +} \ No newline at end of file