Index: contrib/benchmark/conf/sort-standard.alg =================================================================== --- contrib/benchmark/conf/sort-standard.alg (revision 983076) +++ contrib/benchmark/conf/sort-standard.alg (working copy) @@ -26,6 +26,7 @@ directory=FSDirectory #directory=RamDirectory +doc.index.props=true doc.stored=true doc.tokenized=true doc.term.vector=false @@ -66,6 +67,4 @@ } : 4 } - -RepSumByName - +RepSumByName \ No newline at end of file Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java (revision 983076) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java (working copy) @@ -28,9 +28,12 @@ import org.apache.lucene.benchmark.byTask.utils.Format; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.Fieldable; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field.TermVector; +import org.apache.lucene.index.values.Values; +import org.apache.lucene.index.values.ValuesAttribute; /** * Creates {@link Document} objects. Uses a {@link ContentSource} to generate @@ -95,9 +98,19 @@ // Initialize the map with the default fields. fields.put(BODY_FIELD, new Field(BODY_FIELD, "", bodyStore, bodyIndex, termVector)); - fields.put(TITLE_FIELD, new Field(TITLE_FIELD, "", store, index, termVector)); + final Field f = new Field(TITLE_FIELD, "", store, index, termVector); + // nocommit need explicit per-field control + if (store == Field.Store.YES) { + f.attributes().addAttribute(ValuesAttribute.class).setType(Values.BYTES_VAR_SORTED); + } + fields.put(TITLE_FIELD, f); fields.put(DATE_FIELD, new Field(DATE_FIELD, "", store, index, termVector)); - fields.put(ID_FIELD, new Field(ID_FIELD, "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); + final Field f2 = new Field(ID_FIELD, "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); + // nocommit need explicit per-field control + if (store == Field.Store.YES) { + f2.attributes().addAttribute(ValuesAttribute.class).setType(Values.BYTES_VAR_SORTED); + } + fields.put(ID_FIELD, f2); fields.put(NAME_FIELD, new Field(NAME_FIELD, "", store, index, termVector)); doc = new Document(); @@ -114,12 +127,29 @@ */ Field getField(String name, Store store, Index index, TermVector termVector) { if (!reuseFields) { - return new Field(name, "", store, index, termVector); + final Field f = new Field(name, "", store, index, termVector); + if (store == Field.Store.YES) { + // nocommit need explicit per-field control + if (name.equals("sort_field")) { + f.attributes().addAttribute(ValuesAttribute.class).setType(Values.PACKED_INTS_FIXED); + } else { + f.attributes().addAttribute(ValuesAttribute.class).setType(Values.BYTES_VAR_SORTED); + } + } + return f; } Field f = fields.get(name); if (f == null) { f = new Field(name, "", store, index, termVector); + if (store == Field.Store.YES) { + // nocommit need explicit per-field control + if (name.equals("sort_field")) { + f.attributes().addAttribute(ValuesAttribute.class).setType(Values.PACKED_INTS); + } else { + f.attributes().addAttribute(ValuesAttribute.class).setType(Values.BYTES_VAR_SORTED); + } + } fields.put(name, f); } return f; @@ -235,7 +265,7 @@ } } - //System.out.println("============== Created doc "+numDocsCreated+" 
:\n"+doc+"\n=========="); + // System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n=========="); return doc; } Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java (revision 983076) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java (working copy) @@ -139,7 +139,7 @@ for(int i=0;i'); return result.toString(); } + + + protected AttributeSource attributes; + + public AttributeSource attributes() { + if(attributes == null) + attributes = new AttributeSource(); + return attributes; + } + + public void setAttributeSource(AttributeSource source){ + attributes = source; + } } Index: src/java/org/apache/lucene/document/AttributeField.java =================================================================== --- src/java/org/apache/lucene/document/AttributeField.java (revision 0) +++ src/java/org/apache/lucene/document/AttributeField.java (revision 0) @@ -0,0 +1,186 @@ +package org.apache.lucene.document; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeSource; + +/* + * nocommit - This class reflects my idea of utilizing Attributes to make Fields + * eventually extendible and maybe even easier to use. A document is still a + * collection of fields but a field is only tuple. Each + * Attribute serves a certain purpose like the ValuesAttribute for IndexValues + * and the example in InvertableValueAttribute (maybe shorter names are + * preferable). + * + * Both, the user of the API and the indexing code only ask for the attributes + * they are interested in like the part of the indexer who takes care of storing + * CSF only asks for the ValuesAttribute and so on. Similarly we could have a + * StoreAttribute and a BoostAttribute etc. + * + * To combine this with the idea of a FieldType we could differentiate between + * shared attributes (essentially a AttributeSource used across field instances) + * which holds "static" immutable attributes. Others are considered per field + * attributes like the InvertableValueAttribute, those are per field instance + * and are mutable. + * + * The user of the API are only exposed to the Attributes he really needs and + * more sophisticated features like TermVector are kind of hidden from them + * completely. + * + * This whole API is just an idea an I thought I should sketch it out in this + * feature - feedback is very very welcome. I expect people saying + * "users are used to Field and Document and all its setters and getters" - yeah + * I guess that is right. 
+public class AttributeField implements Fieldable {
+
+  private String name;
+
+  public AttributeField(String name) {
+    this.name = name;
+  }
+
+  public Reader readerValue() {
+    final InvertableValueAttribute attr = getAttributeOrNull(InvertableValueAttribute.class);
+    return attr == null ? null : attr.reader();
+  }
+
+  public String stringValue() {
+    final InvertableValueAttribute attr = getAttributeOrNull(InvertableValueAttribute.class);
+    return attr == null ? null : attr.string();
+  }
+
+  public TokenStream tokenStreamValue() {
+    final InvertableValueAttribute attr = getAttributeOrNull(InvertableValueAttribute.class);
+    return attr == null ? null : attr.tokenStream();
+  }
+
+  private <T extends Attribute> T getAttributeOrNull(Class<T> attr) {
+    if (attributes.hasAttribute(attr)) {
+      return this.attributes.getAttribute(attr);
+    }
+    return null;
+  }
+
+  private AttributeSource attributes;
+
+  public AttributeSource attributes() {
+    if (attributes == null)
+      attributes = new AttributeSource();
+    return attributes;
+  }
+
+  public void setAttributeSource(AttributeSource source) {
+    attributes = source;
+  }
+
+  public int getBinaryLength() {
+    // TODO Auto-generated method stub
+    return 0;
+  }
+
+  public int getBinaryOffset() {
+    // TODO Auto-generated method stub
+    return 0;
+  }
+
+  public byte[] getBinaryValue() {
+    // TODO Auto-generated method stub
+    return null;
+  }
+
+  public byte[] getBinaryValue(byte[] result) {
+    // TODO Auto-generated method stub
+    return null;
+  }
+
+  public float getBoost() {
+    // TODO Auto-generated method stub
+    return 0;
+  }
+
+  public boolean getOmitNorms() {
+    // TODO Auto-generated method stub
+    return false;
+  }
+
+  public boolean getOmitTermFreqAndPositions() {
+    // TODO Auto-generated method stub
+    return false;
+  }
+
+  public boolean isBinary() {
+    // TODO Auto-generated method stub
+    return false;
+  }
+
+  public boolean isIndexed() {
+    return attributes.hasAttribute(InvertableValueAttribute.class);
+  }
+
+  public boolean isLazy() {
+    // TODO Auto-generated method stub
+    return false;
+  }
+
+  public boolean isStoreOffsetWithTermVector() {
+    // TODO Auto-generated method stub
+    return false;
+  }
+
+  public boolean isStorePositionWithTermVector() {
+    // TODO Auto-generated method stub
+    return false;
+  }
+
+  public boolean isStored() {
+    // TODO Auto-generated method stub
+    return false;
+  }
+
+  public boolean isTermVectorStored() {
+    // TODO Auto-generated method stub
+    return false;
+  }
+
+  public boolean isTokenized() {
+    // TODO Auto-generated method stub
+    return false;
+  }
+
+  public String name() {
+    return name;
+  }
+
+  public void setBoost(float boost) {
+  }
+
+  public void setOmitNorms(boolean omitNorms) {
+  }
+
+  public void setOmitTermFreqAndPositions(boolean omitTermFreqAndPositions) {
+  }
+
+}

Property changes on: src/java/org/apache/lucene/document/AttributeField.java
___________________________________________________________________
Added: svn:eol-style
   + native
Added: svn:keywords
   + Date Author Id Revision HeadURL

Index: src/java/org/apache/lucene/document/Field.java
===================================================================
--- src/java/org/apache/lucene/document/Field.java	(revision 983076)
+++ src/java/org/apache/lucene/document/Field.java	(working copy)
@@ -19,6 +19,10 @@
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.index.IndexWriter; // for javadoc
+import org.apache.lucene.index.values.ValuesAttribute;
+import org.apache.lucene.index.values.ValuesEnum.FloatsRef;
+import org.apache.lucene.index.values.ValuesEnum.IntsRef;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.StringHelper;
 
 import java.io.Reader;
@@ -293,6 +297,7 @@
       throw new IllegalArgumentException("cannot set a String value on a binary field");
     }
     fieldsData = value;
+    setIndexValues();
   }
 
   /** Expert: change the value of this field. See setValue(String). */
@@ -314,6 +319,56 @@
     fieldsData = value;
     binaryLength = value.length;
     binaryOffset = 0;
+    setIndexValues();
+
   }
+
+  private void setIndexValues() {
+    // nocommit -- this only works with the setter methods (with an already
+    // added ValuesAttribute) and its primary purpose is to enable DocMaker in
+    // benchmark to work - once explicit field support is done this can go away.
+    if(isStored && attributes().hasAttribute(ValuesAttribute.class)) {
+      ValuesAttribute attribute = attributes.getAttribute(ValuesAttribute.class);
+      if(fieldsData instanceof byte[]){
+        BytesRef bytes = attribute.bytes();
+        if(bytes == null) {
+          throw new IllegalArgumentException("cannot set binary value to IndexValue of type " + attribute.type());
+        }
+        bytes.bytes = (byte[])fieldsData;
+        bytes.offset = binaryOffset;
+        bytes.length = binaryLength;
+      } else if (fieldsData instanceof String) {
+        String string = (String) fieldsData;
+        switch(attribute.type()){
+        case BYTES_FIXED_DEREF:
+        case BYTES_FIXED_SORTED:
+        case BYTES_FIXED_STRAIGHT:
+        case BYTES_VAR_DEREF:
+        case BYTES_VAR_SORTED:
+        case BYTES_VAR_STRAIGHT: {
+          BytesRef ref = attribute.bytes();
+          ref.copy(string);
+        }
+          break;
+        case PACKED_INTS:
+        case PACKED_INTS_FIXED: {
+          IntsRef ref = attribute.ints();
+          ref.set(Long.parseLong(string));
+        }
+          break;
+        case SIMPLE_FLOAT_4BYTE:
+        case SIMPLE_FLOAT_8BYTE: {
+          FloatsRef ref = attribute.floats();
+          ref.set(Double.parseDouble(string));
+        }
+          break;
+        }
+      }
+    }
+  }
 
   /** Expert: change the value of this field. See setValue(String). */
@@ -324,6 +379,7 @@
     fieldsData = value;
     binaryLength = length;
     binaryOffset = offset;
+    setIndexValues();
   }
 
   /** Expert: sets the token stream to be used for indexing and causes isIndexed() and isTokenized() to return true.
Index: src/java/org/apache/lucene/document/Fieldable.java
===================================================================
--- src/java/org/apache/lucene/document/Fieldable.java	(revision 983076)
+++ src/java/org/apache/lucene/document/Fieldable.java	(working copy)
@@ -20,6 +20,7 @@
 import org.apache.lucene.index.FieldInvertState; // for javadocs
 import org.apache.lucene.search.PhraseQuery; // for javadocs
 import org.apache.lucene.search.spans.SpanQuery; // for javadocs
+import org.apache.lucene.util.AttributeSource;
 
 import java.io.Reader;
 import java.io.Serializable;
@@ -209,4 +210,6 @@
    * silently fail to find results.
    */
   void setOmitTermFreqAndPositions(boolean omitTermFreqAndPositions);
+
+  AttributeSource attributes();
 }
Index: src/java/org/apache/lucene/document/InvertableValueAttribute.java
===================================================================
--- src/java/org/apache/lucene/document/InvertableValueAttribute.java	(revision 0)
+++ src/java/org/apache/lucene/document/InvertableValueAttribute.java	(revision 0)
@@ -0,0 +1,34 @@
+package org.apache.lucene.document;
+/**
+* Copyright 2004 The Apache Software Foundation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+import java.io.Reader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.util.Attribute;
+
+// nocommit - this is just an example of how Fields could utilize attributes for
+// extensibility and FieldType support; see AttributeField for details
+public interface InvertableValueAttribute extends Attribute {
+
+  TokenStream tokenStream();
+
+  String string();
+
+  Reader reader();
+
+  Index index();
+
+}

Property changes on: src/java/org/apache/lucene/document/InvertableValueAttribute.java
___________________________________________________________________
Added: svn:eol-style
   + native
Added: svn:keywords
   + Date Author Id Revision HeadURL

Index: src/java/org/apache/lucene/index/ByteBlockPool.java
===================================================================
--- src/java/org/apache/lucene/index/ByteBlockPool.java	(revision 983076)
+++ src/java/org/apache/lucene/index/ByteBlockPool.java	(working copy)
@@ -39,12 +39,12 @@
 import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF;
 import org.apache.lucene.util.ArrayUtil;
 
-final class ByteBlockPool {
+public final class ByteBlockPool {
 
-  abstract static class Allocator {
-    abstract void recycleByteBlocks(byte[][] blocks, int start, int end);
-    abstract void recycleByteBlocks(List<byte[]> blocks);
-    abstract byte[] getByteBlock();
+  public abstract static class Allocator {
+    public abstract void recycleByteBlocks(byte[][] blocks, int start, int end);
+    public abstract void recycleByteBlocks(List<byte[]> blocks);
+    public abstract byte[] getByteBlock();
   }
 
   public byte[][] buffers = new byte[10][];
Index: src/java/org/apache/lucene/index/CompoundFileReader.java
===================================================================
--- src/java/org/apache/lucene/index/CompoundFileReader.java	(revision 983076)
+++ src/java/org/apache/lucene/index/CompoundFileReader.java	(working copy)
@@ -157,7 +157,7 @@
         throw new IOException("Stream closed");
 
     id = IndexFileNames.stripSegmentName(id);
-    FileEntry entry = entries.get(id);
+    final FileEntry entry = entries.get(id);
     if (entry == null)
       throw new IOException("No sub-file with id " + id + " found");
Index: src/java/org/apache/lucene/index/CompoundFileWriter.java
===================================================================
--- src/java/org/apache/lucene/index/CompoundFileWriter.java	(revision 983076)
+++ src/java/org/apache/lucene/index/CompoundFileWriter.java	(working copy)
@@ -49,9 +49,13 @@
  */
 final class CompoundFileWriter {
 
-  private static final class FileEntry {
+  static final class FileEntry {
+
+    FileEntry(String file) {
+      this.file = file;
+    }
 
     /** source file */
-    String file;
+    final String file;
 
     /** temporary holder for the start of directory entry for this file */
     long directoryOffset;
@@ -128,10 +132,7 @@
     if (! ids.add(file))
       throw new IllegalArgumentException(
           "File " + file + " already added");
-
-    FileEntry entry = new FileEntry();
-    entry.file = file;
-    entries.add(entry);
+    entries.add(new FileEntry(file));
   }
 
   /** Merge files with the extensions added up to now.
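Taken together with the Field.setIndexValues() hook above, a minimal end-to-end
sketch (field name and value are illustrative; the calls mirror the DocMaker and
DirectoryReader changes in this patch) looks like:

    // indexing: type the attribute first, then let setValue() mirror the
    // value into the ValuesAttribute via Field.setIndexValues()
    Field field = new Field("price", "", Field.Store.YES,
                            Field.Index.NOT_ANALYZED_NO_NORMS);
    field.attributes().addAttribute(ValuesAttribute.class)
         .setType(Values.PACKED_INTS);
    field.setValue("42");   // parsed with Long.parseLong into the IntsRef
    doc.add(field);

    // searching: read the column-stride values without going through stored
    // fields (Source is the fully loaded, random-access view)
    Source source = reader.getIndexValues("price").load();
    long value = source.ints(docID);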
Index: src/java/org/apache/lucene/index/DirectoryReader.java =================================================================== --- src/java/org/apache/lucene/index/DirectoryReader.java (revision 983076) +++ src/java/org/apache/lucene/index/DirectoryReader.java (working copy) @@ -36,6 +36,11 @@ import org.apache.lucene.store.Lock; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.index.values.Reader; +import org.apache.lucene.index.values.Values; +import org.apache.lucene.index.values.ValuesEnum; +import org.apache.lucene.index.values.Reader.Source; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Bits; import org.apache.lucene.util.ReaderUtil; import org.apache.lucene.util.BytesRef; @@ -1014,7 +1019,264 @@ return commits; } + + public Reader getIndexValues(String field) { + ensureOpen(); + if (subReaders.length == 1) { + return subReaders[0].getIndexValues(field); + } + return new MultiValueReader(field); + } + + private class MultiValueReader extends Reader { + + private String id; + private Values value; + + public MultiValueReader(String id) { + this.id = id; + for (SegmentReader reader : subReaders) { + FieldInfo fieldInfo = reader.fieldInfos().fieldInfo(id); + if(fieldInfo != null){ + value = fieldInfo.getIndexValues(); + break; + } + } + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + return new MultiValuesEnum(id, value); + } + + @Override + public Source load() throws IOException { + return new MultiSource(id); + } + + public void close() throws IOException { + // + } + + } + + private class MultiValuesEnum extends ValuesEnum { + private int numDocs_ = 0; + private int pos = -1; + private int start = 0; + private final String id; + private final ValuesEnum[] enumCache; + private ValuesEnum current; + + protected MultiValuesEnum(String id, Values enumType) { + super(enumType); + enumCache = new ValuesEnum[subReaders.length]; + this.id = id; + } + + @Override + public void close() throws IOException { + for (ValuesEnum valuesEnum : enumCache) { + if(valuesEnum != null) + valuesEnum.close(); + } + } + + @Override + public int advance( int target) throws IOException { + int n = target - start; + do { + if(target >= maxDoc) + return pos = NO_MORE_DOCS; + if (n >= numDocs_) { + int idx = readerIndex(target); + if (enumCache[idx] == null) { + try { + Reader indexValues = subReaders[idx].getIndexValues(id); + if (indexValues != null) // nocommit does that work with default + // values? + enumCache[idx] = indexValues.getEnum(this.attributes()); + else + enumCache[idx] = new DummyEnum(this.attributes(), + subReaders[idx].maxDoc(), attr.type()); + } catch (IOException ex) { + // nocommit what to do here? 
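+            // getEnum() may fail on I/O; advance() already declares
+            // IOException, so this wrap is likely removable -- for now,
+            // rethrow unchecked rather than swallow the failure.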
+ throw new RuntimeException(ex); + } + } + current = enumCache[idx]; + start = starts[idx]; + numDocs_ = subReaders[idx].maxDoc(); + n = target - start; + } + target = start+numDocs_; + } while ((n = current.advance(n)) == NO_MORE_DOCS); + return pos = start+current.docID(); + } + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws IOException { + return advance(pos+1); + } + } + + private class MultiSource extends Source { + private int numDocs_ = 0; + private int start = 0; + private Source current; + private final String id; + + MultiSource(String id) { + this.id = id; + } + + public long ints(int docID) { + int n = docID - start; + if(n >= numDocs_) { + int idx = readerIndex(docID); + try{ + current = subReaders[idx].getIndexValuesCache().getInts(id); + if(current == null) //nocommit does that work with default values? + current = new DummySource(); + }catch(IOException ex) { + // nocommit what to do here? + throw new RuntimeException(ex); + } + start = starts[idx]; + numDocs_ = subReaders[idx].maxDoc(); + n = docID - start; + } + return current.ints(n); + } + + public double floats(int docID) { + int n = docID - start; + if(n >= numDocs_) { + int idx = readerIndex(docID); + try{ + current = subReaders[idx].getIndexValuesCache().getFloats(id); + if(current == null) //nocommit does that work with default values? + current = new DummySource(); + }catch(IOException ex) { + // nocommit what to do here? + throw new RuntimeException(ex); + } + numDocs_ = subReaders[idx].maxDoc(); + + start = starts[idx]; + n = docID - start; + } + return current.floats(n); + } + + public BytesRef bytes(int docID) { + int n = docID - start; + if(n >= numDocs_) { + int idx = readerIndex(docID); + try{ + current = subReaders[idx].getIndexValuesCache().getBytes(id); + if(current == null) //nocommit does that work with default values? + current = new DummySource(); + }catch(IOException ex) { + // nocommit what to do here? 
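+          // unlike ValuesEnum.advance(), the Source accessors are not
+          // declared to throw IOException, so the load failure is
+          // rethrown unchecked here.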
+ throw new RuntimeException(ex); + } + numDocs_ = subReaders[idx].maxDoc(); + start = starts[idx]; + n = docID - start; + } + return current.bytes(n); + } + + public long ramBytesUsed() { + return current.ramBytesUsed(); + } + + } + + private static class DummySource extends Source { + private final BytesRef ref = new BytesRef(); + @Override + public BytesRef bytes(int docID) { + return ref; + } + + + @Override + public double floats(int docID) { + return 0.0d; + } + + @Override + public long ints(int docID) { + return 0; + } + + public long ramBytesUsed() { + return 0; + } + } + + private static class DummyEnum extends ValuesEnum { + private int pos = -1; + private final int maxDoc; + + public DummyEnum(AttributeSource source, int maxDoc, Values type) { + super(source, type); + this.maxDoc = maxDoc; + switch (type) { + case BYTES_VAR_STRAIGHT: + case BYTES_FIXED_STRAIGHT: + case BYTES_FIXED_DEREF: + case BYTES_FIXED_SORTED: + case BYTES_VAR_DEREF: + case BYTES_VAR_SORTED: + // nocommit - this is not correct for Fixed_straight + BytesRef bytes = attr.bytes(); + bytes.length = 0; + bytes.offset = 0; + break; + case PACKED_INTS: + case PACKED_INTS_FIXED: + IntsRef ints = attr.ints(); + ints.set(0); + break; + + case SIMPLE_FLOAT_4BYTE: + case SIMPLE_FLOAT_8BYTE: + FloatsRef floats = attr.floats(); + floats.set(0d); + break; + default: + throw new IllegalArgumentException("unknown Values type: " + type); + } + } + @Override + public void close() throws IOException { + } + + @Override + public int advance(int target) throws IOException { + return pos = (pos < maxDoc ? target: NO_MORE_DOCS); + } + @Override + public int docID() { + return pos; + } + @Override + public int nextDoc() throws IOException { + return advance(pos+1); + } + + } + + private static final class ReaderCommit extends IndexCommit { private String segmentsFileName; Collection files; Index: src/java/org/apache/lucene/index/DocFieldProcessor.java =================================================================== --- src/java/org/apache/lucene/index/DocFieldProcessor.java (revision 983076) +++ src/java/org/apache/lucene/index/DocFieldProcessor.java (working copy) @@ -17,8 +17,19 @@ * limitations under the License. 
*/ +import org.apache.lucene.store.Directory; +import org.apache.lucene.index.values.Ints; +import org.apache.lucene.index.values.Floats; +import org.apache.lucene.index.values.Bytes; +import org.apache.lucene.index.values.ValuesAttribute; +import org.apache.lucene.index.values.Writer; +import org.apache.lucene.index.values.ValuesEnum.FloatsRef; +import org.apache.lucene.index.values.ValuesEnum.IntsRef; +import org.apache.lucene.util.BytesRef; + import java.io.IOException; import java.util.Collection; +import java.util.Comparator; import java.util.Map; import java.util.HashMap; @@ -37,6 +48,153 @@ final FieldInfos fieldInfos = new FieldInfos(); final DocFieldConsumer consumer; final StoredFieldsWriter fieldsWriter; + final private Map indexValues = new HashMap(); + + synchronized IndexValuesProcessor getProcessor(Directory dir, String segment, String name, ValuesAttribute attr, FieldInfo fieldInfo) + throws IOException { + if(attr == null) + return null; + IndexValuesProcessor p = indexValues.get(name); + if (p == null) { + org.apache.lucene.index.values.Values v = attr.type(); + final String id = segment + "_" + fieldInfo.number; + switch(v) { + case PACKED_INTS: + p = new IntValuesProcessor(dir, id, false); + break; + case PACKED_INTS_FIXED: + p = new IntValuesProcessor(dir, id, true); + break; + case SIMPLE_FLOAT_4BYTE: + p = new FloatValuesProcessor(dir, id, 4); + break; + case SIMPLE_FLOAT_8BYTE: + p = new FloatValuesProcessor(dir, id, 8); + break; + case BYTES_FIXED_STRAIGHT: + p = new BytesValuesProcessor(dir, id, true, null, Bytes.Mode.STRAIGHT); + break; + case BYTES_FIXED_DEREF: + p = new BytesValuesProcessor(dir, id, true, null, Bytes.Mode.DEREF); + break; + case BYTES_FIXED_SORTED: + p = new BytesValuesProcessor(dir, id, true, attr.bytesComparator(), Bytes.Mode.SORTED); + break; + case BYTES_VAR_STRAIGHT: + p = new BytesValuesProcessor(dir, id, false, null, Bytes.Mode.STRAIGHT); + break; + case BYTES_VAR_DEREF: + p = new BytesValuesProcessor(dir, id, false, null, Bytes.Mode.DEREF); + break; + case BYTES_VAR_SORTED: + p = new BytesValuesProcessor(dir, id, false, attr.bytesComparator(), Bytes.Mode.SORTED); + break; + } + fieldInfo.setIndexValues(v); + indexValues.put(name, p); + } + + return p; + } + + static abstract class IndexValuesProcessor { + public abstract void add(int docID, String name, ValuesAttribute attr) throws IOException; + public abstract void finish(int docCount) throws IOException; + public abstract void files(Collection files) throws IOException; + } + + static class FloatValuesProcessor extends IndexValuesProcessor { + private final Writer writer; + private final String id; + + public FloatValuesProcessor(Directory dir, String id, int precision) throws IOException { + this.id = id; + writer = Floats.getWriter(dir, id, precision); + } + + @Override + public void add(int docID, String name, ValuesAttribute attr) throws IOException { + final FloatsRef floats = attr.floats(); + if(floats != null) { + writer.add(docID, floats.get()); + return; + } + throw new IllegalArgumentException("could not extract float/double from field " + name); + } + + @Override + public void finish(int docCount) throws IOException { + writer.finish(docCount); + } + + @Override + public void files(Collection files) { + Floats.files(id, files); + } + } + + static class IntValuesProcessor extends IndexValuesProcessor { + private final Writer writer; + private final String id; + + public IntValuesProcessor(Directory dir, String id, boolean fixedArray) throws IOException { + this.id = id; + 
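+      // fixedArray corresponds to Values.PACKED_INTS_FIXED vs. PACKED_INTS
+      // (see the mapping in getProcessor above)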
writer = Ints.getWriter(dir, id, fixedArray); + } + + @Override + public void add(int docID, String name, ValuesAttribute attr) throws IOException { + final IntsRef ints = attr.ints(); + if(ints != null) { + writer.add(docID, ints.get()); + return; + } + throw new IllegalArgumentException("could not extract int/long from field " + name); + } + + @Override + public void finish(int docCount) throws IOException { + writer.finish(docCount); + } + + @Override + public void files(Collection files) throws IOException { + Ints.files(id, files); + } + } + + static class BytesValuesProcessor extends IndexValuesProcessor { + private final Writer writer; + private final String id; + private final Directory dir; + + public BytesValuesProcessor(Directory dir, String id, boolean fixedSize, Comparator comp, Bytes.Mode mode) throws IOException { + this.id = id; + writer = Bytes.getWriter(dir, id, mode,comp, fixedSize); + this.dir = dir; + } + + // nocommit -- make this thread private and not sync'd + @Override + public synchronized void add(int docID, String name, ValuesAttribute attr) throws IOException { + final BytesRef bytes = attr.bytes(); + if(bytes != null) { + writer.add(docID, bytes); + return; + } + throw new IllegalArgumentException("could not extract byte[] from field " + name); + } + + @Override + public void finish(int docCount) throws IOException { + writer.finish(docCount); + } + + @Override + public void files(Collection files) throws IOException { + Bytes.files(dir, id, files); + } + } public DocFieldProcessor(DocumentsWriter docWriter, DocFieldConsumer consumer) { this.docWriter = docWriter; @@ -63,6 +221,14 @@ fieldsWriter.flush(state); consumer.flush(childThreadsAndFields, state); + for(IndexValuesProcessor p : indexValues.values()) { + if (p != null) { + p.finish(state.numDocs); + p.files(state.flushedFiles); + } + } + indexValues.clear(); + // Important to save after asking consumer to flush so // consumer can alter the FieldInfo* if necessary. EG, // FreqProxTermsWriter does this with Index: src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java =================================================================== --- src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java (revision 983076) +++ src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java (working copy) @@ -20,10 +20,14 @@ import java.util.Collection; import java.util.HashSet; import java.util.List; +import java.util.Set; +import java.util.Map.Entry; import java.io.IOException; import org.apache.lucene.document.Document; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.index.values.ValuesAttribute; import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.RamUsageEstimator; /** @@ -243,10 +247,24 @@ // enabled; we could save [small amount of] CPU // here. quickSort(fields, 0, fieldCount-1); - - for(int i=0;i blocks) { + public void recycleByteBlocks(List blocks) { synchronized(DocumentsWriter.this) { final int size = blocks.size(); for(int i=0;i= YES || dir.fileExists(delFileName))) { fileSet.add(delFileName); } - + //nocommit - is there a better way to get all the dat / idx files? 
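+    // CSF files are named <segment>_<fieldNumber> with .dat/.idx extensions
+    // (see DocFieldProcessor), so a prefix-plus-extension scan picks them up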
+ for(String file : dir.listAll()) { + if(file.startsWith(name) && (file.endsWith("dat") || file.endsWith("idx"))){ + fileSet.add(file); + } + } if (normGen != null) { for (int i = 0; i < normGen.length; i++) { long gen = normGen[i]; Index: src/java/org/apache/lucene/index/SegmentMerger.java =================================================================== --- src/java/org/apache/lucene/index/SegmentMerger.java (revision 983076) +++ src/java/org/apache/lucene/index/SegmentMerger.java (working copy) @@ -31,6 +31,12 @@ import org.apache.lucene.index.codecs.Codec; import org.apache.lucene.index.codecs.MergeState; import org.apache.lucene.index.codecs.FieldsConsumer; +import org.apache.lucene.index.values.Bytes; +import org.apache.lucene.index.values.Ints; +import org.apache.lucene.index.values.Reader; +import org.apache.lucene.index.values.Floats; +import org.apache.lucene.index.values.Values; +import org.apache.lucene.index.values.Writer; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; @@ -157,6 +163,8 @@ if (mergeDocStores && fieldInfos.hasVectors()) mergeVectors(); + mergeIndexValues(); + return mergedDocs; } @@ -170,6 +178,12 @@ reader.close(); } } + + private void addIfExists(Set files, String file, Directory dir) throws IOException{ + if(dir.fileExists(file)){ + files.add(file); + } + } final List createCompoundFile(String fileName, final SegmentInfo info) throws IOException { @@ -183,13 +197,20 @@ !ext.equals(IndexFileNames.FIELDS_INDEX_EXTENSION))) fileSet.add(IndexFileNames.segmentFileName(segment, "", ext)); } - codec.files(directory, info, fileSet); // Fieldable norm files - int numFIs = fieldInfos.size(); + final int numFIs = fieldInfos.size(); for (int i = 0; i < numFIs; i++) { - FieldInfo fi = fieldInfos.fieldInfo(i); + final FieldInfo fi = fieldInfos.fieldInfo(i); + // Index Values aka. CSF + if (fi.indexValues != null) { + addIfExists(fileSet, IndexFileNames.segmentFileName(segment, Integer + .toString(fi.number), IndexFileNames.CSF_DATA_EXTENSION), directory); + addIfExists(fileSet, IndexFileNames.segmentFileName(segment, Integer + .toString(fi.number), IndexFileNames.CSF_INDEX_EXTENSION), + directory); + } if (fi.isIndexed && !fi.omitNorms) { fileSet.add(IndexFileNames.segmentFileName(segment, "", IndexFileNames.NORMS_EXTENSION)); break; @@ -288,10 +309,18 @@ int numReaderFieldInfos = readerFieldInfos.size(); for (int j = 0; j < numReaderFieldInfos; j++) { FieldInfo fi = readerFieldInfos.fieldInfo(j); - fieldInfos.add(fi.name, fi.isIndexed, fi.storeTermVector, - fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, - !reader.hasNorms(fi.name), fi.storePayloads, - fi.omitTermFreqAndPositions); + FieldInfo merged = fieldInfos.add(fi.name, fi.isIndexed, fi.storeTermVector, + fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, + !reader.hasNorms(fi.name), fi.storePayloads, + fi.omitTermFreqAndPositions); + final Values fiIndexValues = fi.indexValues; + final Values mergedIndexValues = merged.indexValues; + if (mergedIndexValues == null) { + merged.setIndexValues(fiIndexValues); + } else if (mergedIndexValues != fiIndexValues) { + // nocommit -- what to do? 
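+            // the same field carries different Values types in two segments;
+            // until some fixed/variable promotion scheme exists, fail loudly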
+ throw new IllegalStateException("cannot merge field " + fi.name + " indexValues changed from " + mergedIndexValues + " to " + fiIndexValues); + } } } else { addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false, false); @@ -302,6 +331,8 @@ addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.STORES_PAYLOADS), false, false, false, true, false); addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.INDEXED), false, false, false, false, false); fieldInfos.add(reader.getFieldNames(FieldOption.UNINDEXED), false); + + // nocommit -- how should we handle index values here? } } fieldInfos.write(directory, segment + ".fnm"); @@ -362,6 +393,77 @@ return docCount; } + private void mergeIndexValues() throws IOException { + final int numFields = fieldInfos.size(); + for (int i = 0; i < numFields; i++) { + final FieldInfo fieldInfo = fieldInfos.fieldInfo(i); + final Values v = fieldInfo.indexValues; + // nocommit we need some kind of compatibility notation for values such + // that two slighly different segments can be merged eg. fixed vs. + // variable byte len or float32 vs. float64 + + if (v != null) { + int docBase = 0; + final List mergeStates = new ArrayList(); + for (IndexReader reader : readers) { + Reader r = reader.getIndexValues(fieldInfo.name); + if (r != null) { + mergeStates.add(new Writer.MergeState(r, docBase, reader + .maxDoc(), reader.getDeletedDocs())); + } + docBase += reader.numDocs(); + } + if (mergeStates.isEmpty()) { + continue; + } + final String id = segment + "_" + fieldInfo.number; + final Writer writer; + switch (v) { + case PACKED_INTS: + case PACKED_INTS_FIXED: + writer = Ints.getWriter(directory, id, true); + break; + case SIMPLE_FLOAT_4BYTE: + writer = Floats.getWriter(directory, id, 4); + break; + case SIMPLE_FLOAT_8BYTE: + writer = Floats.getWriter(directory, id, 8); + break; + case BYTES_FIXED_STRAIGHT: + writer = Bytes.getWriter(directory, id, + Bytes.Mode.STRAIGHT, null, true); + break; + case BYTES_FIXED_DEREF: + writer = Bytes.getWriter(directory, id, + Bytes.Mode.DEREF, null, true); + break; + case BYTES_FIXED_SORTED: + // nocommit -- enable setting Comparator + writer = Bytes.getWriter(directory, id, + Bytes.Mode.SORTED, null, true); + break; + case BYTES_VAR_STRAIGHT: + writer = Bytes.getWriter(directory, id, + Bytes.Mode.STRAIGHT, null, false); + break; + case BYTES_VAR_DEREF: + writer = Bytes.getWriter(directory, id, + Bytes.Mode.DEREF, null, false); + break; + case BYTES_VAR_SORTED: + // nocommit -- enable setting Comparator + writer = Bytes.getWriter(directory, id, + Bytes.Mode.SORTED, null, false); + break; + default: + continue; + } + writer.add(mergeStates); + writer.finish(mergedDocs); + } + } + } + private int copyFieldsWithDeletions(final FieldsWriter fieldsWriter, final IndexReader reader, final FieldsReader matchingFieldsReader) throws IOException, MergeAbortedException, CorruptIndexException { Index: src/java/org/apache/lucene/index/SegmentReader.java =================================================================== --- src/java/org/apache/lucene/index/SegmentReader.java (revision 983076) +++ src/java/org/apache/lucene/index/SegmentReader.java (working copy) @@ -30,6 +30,7 @@ import java.util.concurrent.atomic.AtomicInteger; import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.search.Similarity; import org.apache.lucene.store.BufferedIndexInput; @@ 
-41,6 +42,11 @@ import org.apache.lucene.util.CloseableThreadLocal; import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.index.codecs.FieldsProducer; +import org.apache.lucene.index.values.Bytes; +import org.apache.lucene.index.values.Ints; +import org.apache.lucene.index.values.Reader; +import org.apache.lucene.index.values.Floats; +import org.apache.lucene.index.values.Values; import org.apache.lucene.search.FieldCache; // not great (circular); used only to purge FieldCache entry on close import org.apache.lucene.util.BytesRef; @@ -134,7 +140,7 @@ // Ask codec for its Fields fields = si.getCodec().fieldsProducer(new SegmentReadState(cfsDir, si, fieldInfos, readBufferSize, termsIndexDivisor)); assert fields != null; - + openIndexValuesReaders(cfsDir, si); success = true; } finally { if (!success) { @@ -149,6 +155,57 @@ this.origInstance = origInstance; } + final Map indexValues = new HashMap(); + + // Only opens files... doesn't actually load any values + private void openIndexValuesReaders(Directory dir, SegmentInfo si) throws IOException { + final int numFields = fieldInfos.size(); + for(int i=0;i + * NOTE: The total amount of byte[] data stored (across a single segment) cannot + * exceed 2GB. + *

+ * <p>
+ * NOTE: Each byte[] must be <= 32768 bytes in length
+ * </p>
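+ * <p>
+ * A minimal usage sketch (identifiers are illustrative):
+ * <pre>
+ *   Writer w = Bytes.getWriter(dir, "1_42", Bytes.Mode.DEREF, null, true);
+ *   w.add(docID, bytesRef); // increasing docIDs; gaps are filled with 0 bytes
+ *   w.finish(maxDoc);
+ * </pre>
+ * </p>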

+ */ +//nocommit - add mmap version +//nocommti - add bulk copy where possible +public final class Bytes { + + // don't instantiate! + private Bytes() { + } + + public static enum Mode { + STRAIGHT, DEREF, SORTED + }; + + public static void files(Directory dir, String id, Collection files) + throws IOException { + files.add(IndexFileNames.segmentFileName(id, "", + IndexFileNames.CSF_DATA_EXTENSION)); + final String idxFile = IndexFileNames.segmentFileName(id, "", + IndexFileNames.CSF_INDEX_EXTENSION); + if (dir.fileExists(idxFile)) { + files.add(idxFile); + } + } + + // nocommit -- i shouldn't have to specify fixed? can + // track itself & do the write thing at write time? + public static Writer getWriter(Directory dir, String id, Mode mode, + Comparator comp, boolean fixedSize) throws IOException { + + if (comp == null) { + comp = BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + if (fixedSize) { + if (mode == Mode.STRAIGHT) { + return new FixedStraightBytesImpl.Writer(dir, id); + } else if (mode == Mode.DEREF) { + return new FixedDerefBytesImpl.Writer(dir, id); + } else if (mode == Mode.SORTED) { + return new FixedSortedBytesImpl.Writer(dir, id, comp); + } + } else { + if (mode == Mode.STRAIGHT) { + return new VarStraightBytesImpl.Writer(dir, id); + } else if (mode == Mode.DEREF) { + return new VarDerefBytesImpl.Writer(dir, id); + } else if (mode == Mode.SORTED) { + return new VarSortedBytesImpl.Writer(dir, id, comp); + } + } + + throw new IllegalArgumentException(""); + } + + // nocommit -- I can peek @ header to determing fixed/mode? + public static Reader getReader(Directory dir, String id, Mode mode, + boolean fixedSize, int maxDoc) throws IOException { + if (fixedSize) { + if (mode == Mode.STRAIGHT) { + try { + return new FixedStraightBytesImpl.Reader(dir, id, maxDoc); + } catch (IOException e) { + throw e; + } + } else if (mode == Mode.DEREF) { + try { + return new FixedDerefBytesImpl.Reader(dir, id, maxDoc); + } catch (IOException e) { + throw e; + } + } else if (mode == Mode.SORTED) { + return new FixedSortedBytesImpl.Reader(dir, id, maxDoc); + } + } else { + if (mode == Mode.STRAIGHT) { + return new VarStraightBytesImpl.Reader(dir, id, maxDoc); + } else if (mode == Mode.DEREF) { + return new VarDerefBytesImpl.Reader(dir, id, maxDoc); + } else if (mode == Mode.SORTED) { + return new VarSortedBytesImpl.Reader(dir, id, maxDoc); + } + } + + throw new IllegalArgumentException(""); + } + + static abstract class BytesBaseSource extends Source { + protected final IndexInput datIn; + protected final IndexInput idxIn; + protected final BytesRef defaultValue = new BytesRef(); + + protected BytesBaseSource(IndexInput datIn, IndexInput idxIn) { + this.datIn = datIn; + this.idxIn = idxIn; + } + + public void close() throws IOException { + if (datIn != null) + datIn.close(); + if (idxIn != null) // if straight + idxIn.close(); + + } + } + + static abstract class BytesBaseSortedSource extends SortedSource { + protected final IndexInput datIn; + protected final IndexInput idxIn; + protected final BytesRef defaultValue = new BytesRef(); + + protected BytesBaseSortedSource(IndexInput datIn, IndexInput idxIn) { + this.datIn = datIn; + this.idxIn = idxIn; + } + + public void close() throws IOException { + if (datIn != null) + datIn.close(); + if (idxIn != null) // if straight + idxIn.close(); + + } + } + + static abstract class BytesWriterBase extends Writer { + + private final Directory dir; + private final String id; + protected IndexOutput idxOut; + protected IndexOutput datOut; + protected 
BytesRef bytesRef; + private String codecName; + private int version; + + protected BytesWriterBase(Directory dir, String id, String codecName, + int version, boolean initIndex, boolean initData) throws IOException { + this.dir = dir; + this.id = id; + this.codecName = codecName; + this.version = version; + if (initData) + initDataOut(); + if (initIndex) + initIndexOut(); + } + + protected void initDataOut() throws IOException { + datOut = dir.createOutput(IndexFileNames.segmentFileName(id, "", + IndexFileNames.CSF_DATA_EXTENSION)); + CodecUtil.writeHeader(datOut, codecName, version); + } + + protected void initIndexOut() throws IOException { + idxOut = dir.createOutput(IndexFileNames.segmentFileName(id, "", + IndexFileNames.CSF_INDEX_EXTENSION)); + CodecUtil.writeHeader(idxOut, codecName, version); + } + + /** + * Must be called only with increasing docIDs. It's OK for some docIDs to be + * skipped; they will be filled with 0 bytes. + */ + @Override + public abstract void add(int docID, BytesRef bytes) throws IOException; + + @Override + public synchronized void finish(int docCount) throws IOException { + if (datOut != null) + datOut.close(); + if (idxOut != null) + idxOut.close(); + } + + @Override + protected void add(int docID) throws IOException { + add(docID, bytesRef); + } + + @Override + protected void setNextAttribute(ValuesAttribute attr) { + bytesRef = attr.bytes(); + assert bytesRef != null; + } + } + + /** + * Opens all necessary files, but does not read any data in until you call + * {@link #load}. + */ + static abstract class BytesReaderBase extends Reader { + protected final IndexInput idxIn; + protected final IndexInput datIn; + protected final int version; + protected final String id; + + protected BytesReaderBase(Directory dir, String id, String codecName, + int maxVersion, boolean doIndex) throws IOException { + this.id = id; + datIn = dir.openInput(IndexFileNames.segmentFileName(id, "", + IndexFileNames.CSF_DATA_EXTENSION)); + version = CodecUtil.checkHeader(datIn, codecName, maxVersion, maxVersion); + + if (doIndex) { + idxIn = dir.openInput(IndexFileNames.segmentFileName(id, "", + IndexFileNames.CSF_INDEX_EXTENSION)); + final int version2 = CodecUtil.checkHeader(idxIn, codecName, + maxVersion, maxVersion); + assert version == version2; + } else { + idxIn = null; + } + } + + protected final IndexInput cloneData() { + // is never NULL + return (IndexInput) datIn.clone(); + } + + protected final IndexInput cloneIndex() { + return idxIn == null ? null : (IndexInput) idxIn.clone(); + } + + public void close() throws IOException { + if (datIn != null) { + datIn.close(); + } + if (idxIn != null) { + idxIn.close(); + } + } + } + +} \ No newline at end of file Property changes on: src/java/org/apache/lucene/index/values/Bytes.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/values/Cache.java =================================================================== --- src/java/org/apache/lucene/index/values/Cache.java (revision 0) +++ src/java/org/apache/lucene/index/values/Cache.java (revision 0) @@ -0,0 +1,116 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.values.Reader.SortedSource; +import org.apache.lucene.index.values.Reader.Source; +import org.apache.lucene.util.BytesRef; + +public class Cache { + final IndexReader r; + // TODO(simonw): use WeakHashMaps instead here? + final Map ints = new HashMap(); + final Map floats = new HashMap(); + final Map bytes = new HashMap(); + final Map sortedBytes = new HashMap(); + + public Cache(IndexReader r) { + this.r = r; + } + + synchronized public Source getInts(String id) throws IOException { + Source s = ints.get(id); + if (s == null) { + final Reader indexValues = r.getIndexValues(id); + if (indexValues == null) { + return null; + } + s = indexValues.load(); + ints.put(id, s); + } + + return s; + } + + synchronized public Source getFloats(String id) throws IOException { + Source s = floats.get(id); + if (s == null) { + final Reader indexValues = r.getIndexValues(id); + if (indexValues == null) { + return null; + } + s = indexValues.load(); + floats.put(id, s); + } + + return s; + } + + synchronized public SortedSource getSortedBytes(String id, + Comparator comp) throws IOException { + SortedSource s = sortedBytes.get(id); + if (s == null) { + final Reader indexValues = r.getIndexValues(id); + if (indexValues == null) { + return null; + } + s = indexValues.loadSorted(comp); + sortedBytes.put(id, s); + } else { + // TODO(simonw): verify comp is the same! + } + + return s; + } + + synchronized public Source getBytes(String id) throws IOException { + Source s = bytes.get(id); + if (s == null) { + final Reader indexValues = r.getIndexValues(id); + if (indexValues == null) { + return null; + } + s = indexValues.load(); + bytes.put(id, s); + } + + return s; + } + + public void purgeInts(String id) { + ints.remove(id); + } + + public void purgeFloats(String id) { + floats.remove(id); + } + + public void purgeBytes(String id) { + bytes.remove(id); + } + + public void purgeSortedBytes(String id) { + sortedBytes.remove(id); + } +} Property changes on: src/java/org/apache/lucene/index/values/Cache.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/values/FixedDerefBytesImpl.java =================================================================== --- src/java/org/apache/lucene/index/values/FixedDerefBytesImpl.java (revision 0) +++ src/java/org/apache/lucene/index/values/FixedDerefBytesImpl.java (revision 0) @@ -0,0 +1,270 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.values.Bytes.BytesBaseSource; +import org.apache.lucene.index.values.Bytes.BytesReaderBase; +import org.apache.lucene.index.values.Bytes.BytesWriterBase; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesHash; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.packed.PackedInts; +import org.apache.lucene.util.packed.PackedInts.PackedIntEnum; + +// Stores fixed-length byte[] by deref, ie when two docs +// have the same value, they store only 1 byte[] + +class FixedDerefBytesImpl { + + static final String CODEC_NAME = "FixedDerefBytes"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + static class Writer extends BytesWriterBase { + private int size = -1; + private int idUpto = 1; + private int[] docToID; + + final class Entry extends BytesHash.Entry { + int id; + + public Entry() { + id = idUpto++; + } + } + + private final BytesHash hash = new BytesHash(Entry.class) { + @Override + protected FixedDerefBytesImpl.Writer.Entry newEntry() { + return new FixedDerefBytesImpl.Writer.Entry(); + } + + @Override + public long bytesPerEntry() { + return super.bytesPerEntry() + RamUsageEstimator.NUM_BYTES_INT; + } + }; + + public Writer(Directory dir, String id) throws IOException { + super(dir, id, CODEC_NAME, VERSION_CURRENT, false, false); + docToID = new int[1]; + } + + @Override + synchronized public void add(int docID, BytesRef bytes) throws IOException { + if(bytes.length == 0) // default value - skip it + return; + if (size == -1) { + size = bytes.length; + initDataOut(); + datOut.writeInt(size); + } else if (bytes.length != size) { + throw new IllegalArgumentException("expected bytes size=" + size + + " but got " + bytes.length); + } + final int idUptoStart = idUpto; + final Entry e = hash.add(bytes); + + if (e.id == idUptoStart) { + // new added entry + datOut.writeBytes(bytes.bytes, bytes.offset, bytes.length); + } + + if (docID >= docToID.length) { + docToID = ArrayUtil.grow(docToID, 1 + docID); + } + docToID[docID] = e.id; + } + + synchronized public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_INT * docToID.length + + hash.ramBytesUsed(); + } + + // Important that we get docCount, in case there were + // some last docs that we didn't see + @Override + synchronized public void finish(int docCount) throws IOException { + if (datOut == null) // no added data + return; + initIndexOut(); + idxOut.writeInt(idUpto - 1); + // write index + final PackedInts.Writer w = PackedInts.getWriter(idxOut, docCount, + PackedInts.bitsRequired(idUpto - 1)); + final int limit = docCount > docToID.length ? 
docToID.length : docCount; + for (int i = 0; i < limit; i++) { + w.add(docToID[i]); + } + // fill up remaining doc with zeros + for (int i = limit; i < docCount; i++) { + w.add(0); + } + w.finish(); + super.finish(docCount); + } + } + + public static class Reader extends BytesReaderBase { + private final int size; + + Reader(Directory dir, String id, int maxDoc) throws IOException { + super(dir, id, CODEC_NAME, VERSION_START, true); + try { + size = datIn.readInt(); + } catch (IOException e) { + System.out.println(id); + throw e; + } + } + + @Override + public Source load() throws IOException { + return new Source(cloneData(), cloneIndex(), size); + } + + private static class Source extends BytesBaseSource { + // TODO: paged data or mmap? + private final byte[] data; + private final BytesRef bytesRef = new BytesRef(); + private final PackedInts.Reader index; + private final int numValue; + private final int size; + + protected Source(IndexInput datIn, IndexInput idxIn, int size) + throws IOException { + super(datIn, idxIn); + this.size = size; + numValue = idxIn.readInt(); + data = new byte[size * numValue]; + datIn.readBytes(data, 0, size * numValue); + index = PackedInts.getReader(idxIn); + bytesRef.bytes = data; + bytesRef.length = size; + } + + @Override + public BytesRef bytes(int docID) { + final int id = (int) index.get(docID); + if (id == 0) { + return defaultValue; + } + bytesRef.offset = (int) ((id - 1) * size); + return bytesRef; + } + + public long ramBytesUsed() { + // TODO(simonw): move ram calculation to PackedInts?! + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + + data.length + + (RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + index + .getBitsPerValue() + * index.size()); + } + + @Override + public int getValueCount() { + return numValue; + } + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + return new DerefBytesEnum(source, cloneData(), cloneIndex(), CODEC_NAME, + size); + } + + static class DerefBytesEnum extends ValuesEnum { + protected final IndexInput datIn; + private final PackedIntEnum idx; + protected final long fp; + private final int size; + protected final BytesRef ref; + private final int valueCount; + private int pos = -1; + + public DerefBytesEnum(AttributeSource source, IndexInput datIn, + IndexInput idxIn, String codecName, int size) throws IOException { + this(source, datIn, idxIn, codecName, size, Values.BYTES_FIXED_DEREF); + } + + protected DerefBytesEnum(AttributeSource source, IndexInput datIn, + IndexInput idxIn, String codecName, int size, Values enumType) + throws IOException { + super(source, enumType); + ref = attr.bytes(); + this.datIn = datIn; + this.size = size == -1 ? 
128 : size; + idxIn.readInt();// read valueCount + idx = PackedInts.getEnum(idxIn); + fp = datIn.getFilePointer(); + ref.grow(this.size); + ref.length = this.size; + ref.offset = 0; + valueCount = idx.size(); + } + + @Override + public int advance(int target) throws IOException { + if (target < valueCount) { + final long address = idx.advance(target); + pos = idx.docID(); + if(address == 0) { + // default is empty + ref.length = 0; + ref.offset = 0; + return pos; + } + fill(address, ref); + return pos; + } + return pos = NO_MORE_DOCS; + } + + @Override + public int nextDoc() throws IOException { + return advance(pos + 1); + } + + public void close() throws IOException { + datIn.close(); + idx.close(); + } + + protected void fill(long address, BytesRef ref) throws IOException { + datIn.seek(fp + ((address - 1) * size)); + datIn.readBytes(ref.bytes, 0, size); + ref.length = size; + ref.offset = 0; + } + + @Override + public int docID() { + return pos; + } + + } + } + +} Property changes on: src/java/org/apache/lucene/index/values/FixedDerefBytesImpl.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/values/FixedSortedBytesImpl.java =================================================================== --- src/java/org/apache/lucene/index/values/FixedSortedBytesImpl.java (revision 0) +++ src/java/org/apache/lucene/index/values/FixedSortedBytesImpl.java (revision 0) @@ -0,0 +1,253 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Comparator; + +import org.apache.lucene.index.values.Bytes.BytesBaseSortedSource; +import org.apache.lucene.index.values.Bytes.BytesReaderBase; +import org.apache.lucene.index.values.Bytes.BytesWriterBase; +import org.apache.lucene.index.values.FixedDerefBytesImpl.Reader.DerefBytesEnum; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesHash; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.packed.PackedInts; + +// Stores fixed-length byte[] by deref, ie when two docs +// have the same value, they store only 1 byte[] + +class FixedSortedBytesImpl { + + static final String CODEC_NAME = "FixedSortedBytes"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + static class Writer extends BytesWriterBase { + private int size = -1; + private Entry[] docToEntry; + private final Comparator comp; + + final static class Entry extends BytesHash.Entry { + int address; + } + + private final BytesHash hash = new BytesHash(Entry.class) { + @Override + protected FixedSortedBytesImpl.Writer.Entry newEntry() { + return new FixedSortedBytesImpl.Writer.Entry(); + } + @Override + public long bytesPerEntry() { + return super.bytesPerEntry() + RamUsageEstimator.NUM_BYTES_INT; + } + }; + + public Writer(Directory dir, String id, Comparator comp) throws IOException { + super(dir, id, CODEC_NAME, VERSION_CURRENT, false, false); + docToEntry = new Entry[1]; + this.comp = comp; + } + + @Override + synchronized public void add(int docID, BytesRef bytes) throws IOException { + if(bytes.length == 0) + return; // default - skip it + if (size == -1) { + size = bytes.length; + initDataOut(); + datOut.writeInt(size); + } else if (bytes.length != size) { + throw new IllegalArgumentException("expected bytes size=" + size + " but got " + bytes.length); + } + if (docID >= docToEntry.length) { + Entry[] newArray = new Entry[ArrayUtil.oversize(1+docID, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(docToEntry, 0, newArray, 0, docToEntry.length); + docToEntry = newArray; + } + docToEntry[docID] = hash.add(bytes); + } + + synchronized public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_OBJ_REF * docToEntry.length + hash.ramBytesUsed(); + } + + // Important that we get docCount, in case there were + // some last docs that we didn't see + @Override + synchronized public void finish(int docCount) throws IOException { + if(datOut == null)// no data added + return; + initIndexOut(); + Entry[] sortedEntries = hash.sort(comp); + final int count = hash.size(); + + // first dump bytes data, recording address as we go + for(int i=0;i docToEntry.length) { + limit = docToEntry.length; + } else { + limit = docCount; + } + for(int i=0;i comp) throws IOException { + return new Source(cloneData(), cloneIndex(), size, comp); + } + + private static class Source extends BytesBaseSortedSource { + + // TODO: paged data + private final byte[] data; + private final BytesRef bytesRef = new BytesRef(); + private final PackedInts.Reader index; + private final LookupResult lookupResult = new LookupResult(); + private final int numValue; + private final Comparator comp; + private final int size; + + public Source(IndexInput datIn, IndexInput idxIn, int size, 
Comparator comp) throws IOException { + super(datIn, idxIn); + this.size = size; + datIn.seek(CodecUtil.headerLength(CODEC_NAME) + 4); + idxIn.seek(CodecUtil.headerLength(CODEC_NAME)); + + numValue = idxIn.readInt(); + data = new byte[size*numValue]; + datIn.readBytes(data, 0, size*numValue); + datIn.close(); + + index = PackedInts.getReader(idxIn); + idxIn.close(); // do we need to close that here? + + bytesRef.bytes = data; + bytesRef.length = size; + // default byte sort order + this.comp = comp==null?BytesRef.getUTF8SortedAsUnicodeComparator():comp; + } + + @Override + public int ord(int docID) { + return (int) index.get(docID); + } + + @Override + public BytesRef getByOrd(int ord) { + if (ord == 0) { + return defaultValue; + } else { + bytesRef.offset = (int) ((ord-1) * size); + return bytesRef; + } + } + + @Override + public LookupResult getByValue(BytesRef bytes) { + return binarySearch(bytes, 0, numValue-1); + } + + public long ramBytesUsed() { + // TODO(simonw): move ram calcultation to PackedInts? + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + data.length + + (RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + index.getBitsPerValue() * index.size()); + } + + @Override + public int getValueCount() { + return numValue; + } + + private LookupResult binarySearch(BytesRef b, int low, int high) { + + while (low <= high) { + int mid = (low + high) >>> 1; + bytesRef.offset = mid * size; + int cmp = comp.compare(bytesRef, b); + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + lookupResult.ord = mid+1; + lookupResult.found = true; + return lookupResult; + } + } + lookupResult.ord = low; + lookupResult.found = false; + return lookupResult; + } + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + // do unsorted + return new DerefBytesEnum(source, cloneData(), cloneIndex(), CODEC_NAME, size); + } + } +} Property changes on: src/java/org/apache/lucene/index/values/FixedSortedBytesImpl.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/values/FixedStraightBytesImpl.java =================================================================== --- src/java/org/apache/lucene/index/values/FixedStraightBytesImpl.java (revision 0) +++ src/java/org/apache/lucene/index/values/FixedStraightBytesImpl.java (revision 0) @@ -0,0 +1,221 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
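Worth spelling out what the sorted fixed-size layout above buys: ords are dense and follow the comparator order, so sorting documents reduces to integer comparisons, with bytes materialized only on demand. A minimal sketch, assuming `reader` is the Reader above (passing null selects the UTF-8/Unicode comparator, and ord 0 -- "no value" -- sorts lowest):

    SortedSource sorted = reader.loadSorted(null);
    int cmp = sorted.ord(docA) - sorted.ord(docB);       // one int compare per pair
    BytesRef value = sorted.getByOrd(sorted.ord(docA));  // bytes only when needed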
+ */ + +import java.io.IOException; + +import org.apache.lucene.index.values.Bytes.BytesBaseSource; +import org.apache.lucene.index.values.Bytes.BytesReaderBase; +import org.apache.lucene.index.values.Bytes.BytesWriterBase; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RamUsageEstimator; + +// Simplest storage: stores fixed length byte[] per +// document, with no dedup and no sorting. + +class FixedStraightBytesImpl { + + static final String CODEC_NAME = "FixedStraightBytes"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + static class Writer extends BytesWriterBase { + private int size = -1; + // start at -1 if the first added value is > 0 + private int lastDocID = -1; + private byte[] oneRecord; + + protected Writer(Directory dir, String id) throws IOException { + super(dir, id, CODEC_NAME, VERSION_CURRENT, false, false); + } + + // nocommit - impl bulk copy here! + + @Override + synchronized public void add(int docID, BytesRef bytes) throws IOException { + if (size == -1) { + size = bytes.length; + initDataOut(); + datOut.writeInt(size); + oneRecord = new byte[size]; + } else if (bytes.length != size) { + throw new IllegalArgumentException("expected bytes size=" + size + " but got " + bytes.length); + } + fill(docID); + assert bytes.bytes.length >= bytes.length; + datOut.writeBytes(bytes.bytes, bytes.offset, bytes.length); + } + + /* (non-Javadoc) + * @see org.apache.lucene.index.values.Writer#merge(org.apache.lucene.index.values.Writer.MergeState) + */ + @Override + protected void merge(MergeState state) throws IOException { + if(state.bits == null && state.reader instanceof Reader){ + Reader reader = (Reader) state.reader; + final int maxDocs = reader.maxDoc; + if(maxDocs == 0) + return; + if(size == -1) { + size = reader.size; + initDataOut(); + datOut.writeInt(size); + oneRecord = new byte[size]; + } + fill(state.docBase); + // nocommit should we add a transfer to API to each reader? 
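+ // No deletions and a matching codec means the raw file region can be
+ // appended verbatim: fill(state.docBase) above has already padded the
+ // gap with zeroed records, and the source's data region is size * maxDocs
+ // bytes, so no per-document decode/encode is needed on this path.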
+ datOut.copyBytes(reader.cloneData(), size * maxDocs); + lastDocID += maxDocs-1; + } else + super.merge(state); + } + + // Fills up to but not including this docID + private void fill(int docID) throws IOException { + assert size >= 0; + for(int i=lastDocID+1;i= maxDoc){ + ref.length = 0; + ref.offset = 0; + return pos = NO_MORE_DOCS; + } + if((target-1) != pos) // pos inc == 1 + datIn.seek(fp + target * size); + datIn.readBytes(ref.bytes, 0, size); + return pos = target; + } + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws IOException { + return advance(pos+1); + } + } + } +} Property changes on: src/java/org/apache/lucene/index/values/FixedStraightBytesImpl.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/values/Floats.java =================================================================== --- src/java/org/apache/lucene/index/values/Floats.java (revision 0) +++ src/java/org/apache/lucene/index/values/Floats.java (revision 0) @@ -0,0 +1,389 @@ +package org.apache.lucene.index.values; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.DoubleBuffer; +import java.nio.FloatBuffer; +import java.util.Collection; + +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.values.ValuesEnum.FloatsRef; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.RamUsageEstimator; + +/** + * Exposes writer/reader for floating point values. You can specify 4 (java + * float) or 8 (java double) byte precision. + */ +//nocommit - add mmap version +//nocommti - add bulk copy where possible +public class Floats { + private static final String CODEC_NAME = "SimpleFloats"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + private static final int INT_ZERO = Float.floatToRawIntBits(0.0f); + private static final long LONG_ZERO = Double.doubleToRawLongBits(0.0); + + public static void files(String id, Collection files) { + files.add(id + "." 
+ IndexFileNames.CSF_DATA_EXTENSION); + } + + public static Writer getWriter(Directory dir, String id, int precisionBytes) + throws IOException { + if (precisionBytes != 4 && precisionBytes != 8) { + throw new IllegalArgumentException("precisionBytes must be 4 or 8; got " + + precisionBytes); + } + if (precisionBytes == 4) { + return new Float4Writer(dir, id); + } else { + return new Float8Writer(dir, id); + } + } + + public static Reader getReader(Directory dir, String id, int maxDoc) + throws IOException { + return new FloatsReader(dir, id, maxDoc); + } + + abstract static class FloatsWriter extends Writer { + private final Directory dir; + private final String id; + private FloatsRef floatsRef; + protected int lastDocId = -1; + protected IndexOutput datOut; + private final byte precision; + + protected FloatsWriter(Directory dir, String id, int precision) + throws IOException { + this.dir = dir; + this.id = id; + this.precision = (byte) precision; + } + + protected void initDatOut() throws IOException { + datOut = dir.createOutput(IndexFileNames.segmentFileName(id, "", + IndexFileNames.CSF_DATA_EXTENSION)); + CodecUtil.writeHeader(datOut, CODEC_NAME, VERSION_CURRENT); + assert datOut.getFilePointer() == CodecUtil.headerLength(CODEC_NAME); + datOut.writeByte(precision); + } + + public long ramBytesUsed() { + return 0; + } + + @Override + protected void add(int docID) throws IOException { + add(docID, floatsRef.get()); + } + + @Override + protected void setNextAttribute(ValuesAttribute attr) { + floatsRef = attr.floats(); + } + + protected abstract int fillDefault(int num) throws IOException; + + @Override + protected void merge(MergeState state) throws IOException { + if (state.bits == null && state.reader instanceof FloatsReader) { + // no deletes - bulk copy + // nocommit - should be do bulks with deletes too? + final FloatsReader reader = (FloatsReader) state.reader; + assert reader.precisionBytes == (int) precision; + if (reader.maxDoc == 0) + return; + if (datOut == null) + initDatOut(); + final int docBase = state.docBase; + if (docBase - lastDocId > 1) { + // fill with default values + lastDocId += fillDefault(docBase - lastDocId - 1); + } + lastDocId += reader.transferTo(datOut); + } else + super.merge(state); + } + + } + + // Writes 4 bytes (float) per value + static class Float4Writer extends FloatsWriter { + + protected Float4Writer(Directory dir, String id) throws IOException { + super(dir, id, 4); + } + + @Override + synchronized public void add(final int docID, final double v) + throws IOException { + assert docID > lastDocId : "docID: " + docID + + " must be greater than the last added doc id: " + lastDocId; + if (datOut == null) { + initDatOut(); + } + if (docID - lastDocId > 1) { + // fill with default values + lastDocId += fillDefault(docID - lastDocId - 1); + } + assert datOut != null; + datOut.writeInt(Float.floatToRawIntBits((float) v)); + ++lastDocId; + } + + @Override + synchronized public void finish(int docCount) throws IOException { + if (datOut == null) + return; // no data added - don't create file! 
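+ // nocommit: this pad loop starts at i = lastDocId, but doc lastDocId
+ // already has its value on disk -- one extra value is written whenever
+ // padding kicks in; it should start at lastDocId + 1
+ // (Float8Writer.finish below has the same pattern).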
+ if (docCount > lastDocId + 1) + for (int i = lastDocId; i < docCount; i++) { + datOut.writeInt(INT_ZERO); // default value + } + datOut.close(); + } + + @Override + protected int fillDefault(int numValues) throws IOException { + for (int i = 0; i < numValues; i++) { + datOut.writeInt(INT_ZERO); + } + return numValues; + } + } + + // Writes 8 bytes (double) per value + static class Float8Writer extends FloatsWriter { + + protected Float8Writer(Directory dir, String id) throws IOException { + super(dir, id, 8); + } + + @Override + synchronized public void add(int docID, double v) throws IOException { + assert docID > lastDocId : "docID: " + docID + + " must be greater than the last added doc id: " + lastDocId; + if (datOut == null) { + initDatOut(); + } + if (docID - lastDocId > 1) { + // fill with default values + lastDocId += fillDefault(docID - lastDocId - 1); + } + assert datOut != null; + datOut.writeLong(Double.doubleToRawLongBits(v)); + ++lastDocId; + } + + @Override + synchronized public void finish(int docCount) throws IOException { + if (datOut == null) + return; // no data added - don't create file! + if (docCount > lastDocId + 1) + for (int i = lastDocId; i < docCount; i++) { + datOut.writeLong(LONG_ZERO); // default value + } + datOut.close(); + } + + @Override + protected int fillDefault(int numValues) throws IOException { + for (int i = 0; i < numValues; i++) { + datOut.writeLong(LONG_ZERO); + } + return numValues; + } + } + + /** + * Opens all necessary files, but does not read any data in until you call + * {@link #load}. + */ + static class FloatsReader extends Reader { + + private final IndexInput datIn; + private final int precisionBytes; + // TODO(simonw) is ByteBuffer the way to go here? + private final int maxDoc; + + protected FloatsReader(Directory dir, String id, int maxDoc) + throws IOException { + datIn = dir.openInput(IndexFileNames.segmentFileName(id, "", + IndexFileNames.CSF_DATA_EXTENSION)); + CodecUtil.checkHeader(datIn, CODEC_NAME, VERSION_START, VERSION_START); + precisionBytes = datIn.readByte(); + assert precisionBytes == 4 || precisionBytes == 8; + this.maxDoc = maxDoc; + } + + int transferTo(IndexOutput out) throws IOException { + IndexInput indexInput = (IndexInput) datIn.clone(); + try { + indexInput.seek(CodecUtil.headerLength(CODEC_NAME)); + // skip precision: + indexInput.readByte(); + out.copyBytes(indexInput, precisionBytes * maxDoc); + } finally { + indexInput.close(); + } + return maxDoc; + } + + /** + * Loads the actual values. You may call this more than once, eg if you + * already previously loaded but then discarded the Source. + */ + @Override + public Source load() throws IOException { + ByteBuffer buffer = ByteBuffer.allocate(precisionBytes * maxDoc); + IndexInput indexInput = (IndexInput) datIn.clone(); + indexInput.seek(CodecUtil.headerLength(CODEC_NAME)); + // skip precision: + indexInput.readByte(); + assert buffer.hasArray() : "Buffer must support Array"; + final byte[] arr = buffer.array(); + indexInput.readBytes(arr, 0, arr.length); + return precisionBytes == 4 ? new Source4(buffer) : new Source8(buffer); + } + + private class Source4 extends Source { + private final FloatBuffer values; + + Source4(ByteBuffer buffer) { + values = buffer.asFloatBuffer(); + } + + @Override + public double floats(int docID) { + final float f = values.get(docID); + // nocommit should we return NaN as default instead of 0.0? + return Float.isNaN(f) ? 
0.0f : f; + } + + public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + values.limit() + * RamUsageEstimator.NUM_BYTES_FLOAT; + } + } + + private class Source8 extends Source { + private final DoubleBuffer values; + + Source8(ByteBuffer buffer) { + values = buffer.asDoubleBuffer(); + } + + @Override + public double floats(int docID) { + final double d = values.get(docID); + // nocommit should we return NaN as default instead of 0.0? + return Double.isNaN(d) ? 0.0d : d; + } + + public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + values.limit() + * RamUsageEstimator.NUM_BYTES_DOUBLE; + } + } + + public void close() throws IOException { + datIn.close(); + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + IndexInput indexInput = (IndexInput) datIn.clone(); + indexInput.seek(CodecUtil.headerLength(CODEC_NAME)); + // skip precision: + indexInput.readByte(); + return precisionBytes == 4 ? new Floats4Enum(source, indexInput, maxDoc) + : new Floats8EnumImpl(source, indexInput, maxDoc); + } + } + + static final class Floats4Enum extends FloatsEnumImpl { + + Floats4Enum(AttributeSource source, IndexInput dataIn, int maxDoc) + throws IOException { + super(source, dataIn, 4, maxDoc, Values.SIMPLE_FLOAT_4BYTE); + } + + @Override + public int advance(int target) throws IOException { + if (target >= maxDoc) + return pos = NO_MORE_DOCS; + dataIn.seek(fp + (target * precision)); + ref.val[0] = Float.intBitsToFloat(dataIn.readInt()); + ref.pos = 0; // nocommit -- can we igore this? + return pos = target; + } + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws IOException { + return advance(pos + 1); + } + } + + private static final class Floats8EnumImpl extends FloatsEnumImpl { + + Floats8EnumImpl(AttributeSource source, IndexInput dataIn, int maxDoc) + throws IOException { + super(source, dataIn, 8, maxDoc, Values.SIMPLE_FLOAT_8BYTE); + } + + @Override + public int advance(int target) throws IOException { + if (target >= maxDoc) + return pos = NO_MORE_DOCS; + dataIn.seek(fp + (target * precision)); + ref.val[0] = Double.longBitsToDouble(dataIn.readLong()); + ref.pos = 0; // nocommit -- can we igore this? + return pos = target; + } + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws IOException { + return advance(pos + 1); + } + } + + static abstract class FloatsEnumImpl extends ValuesEnum { + protected final IndexInput dataIn; + protected int pos = -1; + protected final int precision; + protected final int maxDoc; + protected final long fp; + protected final FloatsRef ref; + + FloatsEnumImpl(AttributeSource source, IndexInput dataIn, int precision, + int maxDoc, Values type) throws IOException { + super(source, precision == 4 ? 
Values.SIMPLE_FLOAT_4BYTE + : Values.SIMPLE_FLOAT_8BYTE); + this.dataIn = dataIn; + this.precision = precision; + this.maxDoc = maxDoc; + fp = dataIn.getFilePointer(); + this.ref = attr.floats(); + this.ref.pos = 0; + } + + @Override + public void close() throws IOException { + dataIn.close(); + } + } +} \ No newline at end of file Property changes on: src/java/org/apache/lucene/index/values/Floats.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/values/Ints.java =================================================================== --- src/java/org/apache/lucene/index/values/Ints.java (revision 0) +++ src/java/org/apache/lucene/index/values/Ints.java (revision 0) @@ -0,0 +1,32 @@ +package org.apache.lucene.index.values; + +import java.io.IOException; +import java.util.Collection; + +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.values.PackedIntsImpl.IntsReader; +import org.apache.lucene.index.values.PackedIntsImpl.IntsWriter; +import org.apache.lucene.store.Directory; +//nocommit - add mmap version +//nocommti - add bulk copy where possible +public class Ints { + + private Ints() { + } + + public static void files(String id, Collection files) + throws IOException { + files.add(IndexFileNames.segmentFileName(id, "", + IndexFileNames.CSF_DATA_EXTENSION)); + } + + public static Writer getWriter(Directory dir, String id, boolean useFixedArray) + throws IOException { + //nocommit - implement fixed?! + return new IntsWriter(dir, id); + } + + public static Reader getReader(Directory dir, String id, boolean useFixedArray) throws IOException { + return new IntsReader(dir, id); + } +} Property changes on: src/java/org/apache/lucene/index/values/Ints.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/values/PackedIntsImpl.java =================================================================== --- src/java/org/apache/lucene/index/values/PackedIntsImpl.java (revision 0) +++ src/java/org/apache/lucene/index/values/PackedIntsImpl.java (revision 0) @@ -0,0 +1,241 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
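For orientation, a rough round trip through the static entry points above (Ints.getWriter/getReader are analogous); `dir` is assumed to be an open Directory and the field name and doc IDs are arbitrary:

    static double roundTrip(Directory dir) throws IOException {
      Writer w = Floats.getWriter(dir, "price", 4);  // 4 = java float precision
      w.add(0, 3.5);                                 // add(docID, value)
      w.add(3, 9.25);                                // docs 1-2 fall back to 0.0
      w.finish(5);                                   // pad out to docCount
      Reader r = Floats.getReader(dir, "price", 5);
      double v = r.load().floats(3);                 // RAM source, random access
      r.close();
      return v;                                      // 9.25
    }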
+ */ +import java.io.IOException; + +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.values.ValuesEnum.IntsRef; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.packed.PackedInts; +import org.apache.lucene.util.packed.PackedInts.PackedIntEnum; + +/** Stores ints packed with fixed-bit precision. */ +class PackedIntsImpl { + + private static final String CODEC_NAME = "PackedInts"; + + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + static class IntsWriter extends Writer { + // nocommit - can we bulkcopy this on a merge? + private IntsRef intsRef; + private long[] docToValue; + private long minValue; + private long maxValue; + private boolean started; + private final Directory dir; + private final String id; + private int maxDocID; + private int minDocID; + + protected IntsWriter(Directory dir, String id) throws IOException { + this.dir = dir; + this.id = id; + docToValue = new long[1]; + } + + @Override + synchronized public void add(int docID, long v) throws IOException { + + if (!started) { + minValue = maxValue = v; + minDocID = maxDocID = docID; + started = true; + + } else { + if (v < minValue) { + minValue = v; + } else if (v > maxValue) { + maxValue = v; + } + if (docID < minDocID) { + minDocID = docID; + } else if (docID > maxDocID) { + maxDocID = docID; + } + } + if (docID >= docToValue.length) { + docToValue = ArrayUtil.grow(docToValue, 1 + docID); + } + docToValue[docID] = v; + } + + @Override + synchronized public void finish(int docCount) throws IOException { + if(!started) + return; + final IndexOutput datOut = dir.createOutput(IndexFileNames + .segmentFileName(id, "", IndexFileNames.CSF_DATA_EXTENSION)); + CodecUtil.writeHeader(datOut, CODEC_NAME, VERSION_CURRENT); + + // nocommit -- long can't work right since it's signed + datOut.writeLong(minValue); + // write a default value to recognize docs without a value for that field + final long defaultValue = ++maxValue - minValue; + datOut.writeLong(defaultValue); + PackedInts.Writer w = PackedInts.getWriter(datOut, docCount, PackedInts.bitsRequired(maxValue-minValue)); + + final int limit = maxDocID + 1; + for (int i = 0; i < minDocID; i++) { + w.add(defaultValue); + } + for (int i = minDocID; i < limit; i++) { + w.add(docToValue[i] - minValue); + } + for (int i = limit; i < docCount; i++) { + w.add(defaultValue); + } + w.finish(); + + datOut.close(); + } + + public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + docToValue.length + * RamUsageEstimator.NUM_BYTES_LONG; + } + + @Override + protected void add(int docID) throws IOException { + add(docID, intsRef.get()); + } + + @Override + protected void setNextAttribute(ValuesAttribute attr) { + intsRef = attr.ints(); + } + } + + /** + * Opens all necessary files, but does not read any data in until you call + * {@link #load}. + */ + static class IntsReader extends Reader { + private final IndexInput datIn; + + protected IntsReader(Directory dir, String id) throws IOException { + datIn = dir.openInput(IndexFileNames.segmentFileName(id, "", + IndexFileNames.CSF_DATA_EXTENSION)); + CodecUtil.checkHeader(datIn, CODEC_NAME, VERSION_START, VERSION_START); + } + + /** + * Loads the actual values. 
You may call this more than once, eg if you + * already previously loaded but then discarded the Source. + */ + @Override + public Source load() throws IOException { + return new IntsSource((IndexInput) datIn.clone()); + } + + private static class IntsSource extends Source { + private final long minValue; + private final long defaultValue; + private final PackedInts.Reader values; + + public IntsSource(IndexInput dataIn) throws IOException { + dataIn.seek(CodecUtil.headerLength(CODEC_NAME)); + minValue = dataIn.readLong(); + defaultValue = dataIn.readLong(); + values = PackedInts.getReader(dataIn); + } + + @Override + public long ints(int docID) { + // nocommit -- can we somehow avoid 2X method calls + // on each get? must push minValue down, and make + // PackedInts implement Ints.Source + final long val = values.get(docID); + // docs not having a value for that field must return a default value + return val == defaultValue ? 0 : minValue + val; + } + + public long ramBytesUsed() { + // TODO(simonw): move that to PackedInts? + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + + values.getBitsPerValue() * values.size(); + } + } + + public void close() throws IOException { + datIn.close(); + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + return new IntsEnumImpl(source, (IndexInput) datIn.clone()); + } + + } + + private static final class IntsEnumImpl extends ValuesEnum { + private final PackedIntEnum ints; + private long minValue; + private final IndexInput dataIn; + private final long defaultValue; + private IntsRef ref; + private final int maxDoc; + private int pos = -1; + + private IntsEnumImpl(AttributeSource source, IndexInput dataIn) + throws IOException { + super(source, Values.PACKED_INTS); + this.ref = attr.ints(); + this.ref.pos = 0; + this.dataIn = dataIn; + dataIn.seek(CodecUtil.headerLength(CODEC_NAME)); + minValue = dataIn.readLong(); + defaultValue = dataIn.readLong(); + this.ints = PackedInts.getEnum(dataIn); + maxDoc = ints.size(); + } + + @Override + public void close() throws IOException { + ints.close(); + dataIn.close(); + } + + @Override + public int advance(int target) throws IOException { + if (target >= maxDoc) + return pos = NO_MORE_DOCS; + final long val = ints.advance(target); + ref.val[0] = val == defaultValue? 0:minValue + val; + ref.pos = 0; // can we skip this? + return pos = target; + } + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws IOException { + return advance(pos+1); + } + } +} \ No newline at end of file Property changes on: src/java/org/apache/lucene/index/values/PackedIntsImpl.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/values/Reader.java =================================================================== --- src/java/org/apache/lucene/index/values/Reader.java (revision 0) +++ src/java/org/apache/lucene/index/values/Reader.java (revision 0) @@ -0,0 +1,109 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
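The normalization trick in IntsWriter.finish above deserves a worked example: values are written as offsets from minValue so they pack into fewer bits, and one id past the observed range is reserved as the "no value" marker. With doc values {17, 42, 23}:

    // minValue = 17; maxValue bumped 42 -> 43; defaultValue = 43 - 17 = 26
    // bitsRequired(43 - 17) = 5 bits per doc instead of 64
    // on disk:  17 -> 0,  42 -> 25,  23 -> 6,  doc without a value -> 26
    // read back (IntsSource.ints): val == 26 ? 0 : 17 + val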
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.Closeable; +import java.io.IOException; +import java.util.Comparator; + +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.ConsumesRAM; + +public abstract class Reader implements Closeable { + + + public ValuesEnum getEnum() throws IOException{ + return getEnum(null); + } + + public abstract ValuesEnum getEnum(AttributeSource attrSource) throws IOException; + + public abstract Source load() throws IOException; + + public SortedSource loadSorted(Comparator comparator) throws IOException { + throw new UnsupportedOperationException(); + } + + + /** + * Source of integer (returned as java long), per document. The underlying + * implementation may use different numbers of bits per value; long is only + * used since it can handle all precisions. + */ + public static abstract class Source implements ConsumesRAM { + + public long ints(int docID) { + throw new UnsupportedOperationException("ints are not supported"); + } + + public double floats(int docID) { + throw new UnsupportedOperationException("floats are not supported"); + } + + public BytesRef bytes(int docID) { + throw new UnsupportedOperationException("bytes are not supported"); + } + + /** Returns number of unique values. Some impls may + * throw UnsupportedOperationException. */ + public int getValueCount() { + throw new UnsupportedOperationException(); + } + + public ValuesEnum getEnum() throws IOException{ + return getEnum(null); + } + + // nocommit - enable obtaining enum from source since this is already in memory + public /*abstract*/ ValuesEnum getEnum(AttributeSource attrSource) throws IOException { + throw new UnsupportedOperationException(); + } + + } + + public static abstract class SortedSource extends Source { + + @Override + public BytesRef bytes(int docID) { + return getByOrd(ord(docID)); + } + + /** + * Returns ord for specified docID. If this docID had not been added to the + * Writer, the ord is 0. Ord is dense, ie, starts at 0, then increments by 1 + * for the next (as defined by {@link Comparator} value. + */ + public abstract int ord(int docID); + + /** Returns value for specified ord. */ + public abstract BytesRef getByOrd(int ord); + + public static class LookupResult { + public boolean found; + public int ord; + } + + /** + * Finds the largest ord whose value is <= the requested value. If + * {@link LookupResult#found} is true, then ord is an exact match. The + * returned {@link LookupResult} may be reused across calls. 
+ */ + public abstract LookupResult getByValue(BytesRef value); + } + +} Property changes on: src/java/org/apache/lucene/index/values/Reader.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/values/Values.java =================================================================== --- src/java/org/apache/lucene/index/values/Values.java (revision 0) +++ src/java/org/apache/lucene/index/values/Values.java (revision 0) @@ -0,0 +1,48 @@ +package org.apache.lucene.index.values; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Controls whether per-field values are stored into + * index. This storage is non-sparse, so it's best to + * use this when all docs have the field, and loads all + * values into RAM, exposing a random access API, when + * loaded. + * + *
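Reader above deliberately exposes two access paths: load() materializes a RAM-resident Source with O(1) random access by docID, while getEnum() streams from disk with DocIdSetIterator semantics. A sketch of both against an ints field, assuming `reader` is any concrete Reader from this package:

    Source src = reader.load();               // pay RAM once
    long x = src.ints(42);                    // then O(1) per doc

    ValuesEnum e = reader.getEnum();          // no load cost, forward-only
    ValuesAttribute attr = e.addAttribute(ValuesAttribute.class);
    while (e.nextDoc() != ValuesEnum.NO_MORE_DOCS) {
      long v = attr.ints().get();             // scratch ref, refreshed per doc
    }
    e.close();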

NOTE: This feature is experimental and the + * API is free to change in non-backwards-compatible ways. */ +public enum Values { + + /** Integral value is stored as packed ints. The bit + * precision is fixed across the segment, and + * determined by the min/max values in the field. */ + PACKED_INTS, + PACKED_INTS_FIXED, + SIMPLE_FLOAT_4BYTE, + SIMPLE_FLOAT_8BYTE, + + // nocommit -- shouldn't lucene decide/detect straight vs + // deref, as well fixed vs var? + BYTES_FIXED_STRAIGHT, + BYTES_FIXED_DEREF, + BYTES_FIXED_SORTED, + + BYTES_VAR_STRAIGHT, + BYTES_VAR_DEREF, + BYTES_VAR_SORTED + + // nocommit -- need STRING variants as well +} Property changes on: src/java/org/apache/lucene/index/values/Values.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/values/ValuesAttribute.java =================================================================== --- src/java/org/apache/lucene/index/values/ValuesAttribute.java (revision 0) +++ src/java/org/apache/lucene/index/values/ValuesAttribute.java (revision 0) @@ -0,0 +1,34 @@ +package org.apache.lucene.index.values; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
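The enum above is the only switch a field author has to flip; writer/reader selection and file layout key off it. A minimal sketch of tagging a field with a type through the ValuesAttribute defined just below (`attrSource` stands in for the field's attribute source):

    ValuesAttribute vals = attrSource.addAttribute(ValuesAttribute.class);
    vals.setType(Values.BYTES_FIXED_STRAIGHT);  // e.g. a fixed 16-byte hash per doc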
+ */ +import java.util.Comparator; + +import org.apache.lucene.index.values.ValuesEnum.FloatsRef; +import org.apache.lucene.index.values.ValuesEnum.IntsRef; +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.BytesRef; + +public interface ValuesAttribute extends Attribute { + public Values type(); + public BytesRef bytes(); + public FloatsRef floats(); + public IntsRef ints(); + public void setType(Values type); + public Comparator bytesComparator(); + public void setBytesComparator(Comparator comp); + +} \ No newline at end of file Property changes on: src/java/org/apache/lucene/index/values/ValuesAttribute.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/values/ValuesAttributeImpl.java =================================================================== --- src/java/org/apache/lucene/index/values/ValuesAttributeImpl.java (revision 0) +++ src/java/org/apache/lucene/index/values/ValuesAttributeImpl.java (revision 0) @@ -0,0 +1,126 @@ +package org.apache.lucene.index.values; + +import java.util.Comparator; + +import org.apache.lucene.index.values.ValuesEnum.FloatsRef; +import org.apache.lucene.index.values.ValuesEnum.IntsRef; +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.SetOnce; + +public class ValuesAttributeImpl extends AttributeImpl implements ValuesAttribute { + private final SetOnce type = new SetOnce(); + private BytesRef bytes = null; + private FloatsRef floats = null; + private IntsRef ints = null; + private Comparator bytesComp; + + public BytesRef bytes() { + return bytes; + } + + public FloatsRef floats() { + return floats; + } + + public IntsRef ints() { + return ints; + } + + public Values type() { + return type.get(); + } + + public void setType(Values type) { + this.type.set(type); + switch (type) { + case BYTES_FIXED_DEREF: + case BYTES_FIXED_SORTED: + case BYTES_FIXED_STRAIGHT: + case BYTES_VAR_DEREF: + case BYTES_VAR_SORTED: + case BYTES_VAR_STRAIGHT: + bytes = new BytesRef(); + break; + case PACKED_INTS: + case PACKED_INTS_FIXED: + ints = new IntsRef(); + break; + case SIMPLE_FLOAT_4BYTE: + case SIMPLE_FLOAT_8BYTE: + floats = new FloatsRef(); + break; + + } + } + + @Override + public void clear() { + // TODO + } + + @Override + public void copyTo(AttributeImpl target) { + ValuesAttributeImpl other = (ValuesAttributeImpl)target; + // nocommit - we may get rid of setOnce here + other.setType(type.get()); + } + + /* (non-Javadoc) + * @see java.lang.Object#hashCode() + */ + @Override + public int hashCode() { + final int prime = 31; + int result = 0; + result = prime * result + ((bytes == null) ? 0 : bytes.hashCode()); + result = prime * result + ((floats == null) ? 0 : floats.hashCode()); + result = prime * result + ((ints == null) ? 0 : ints.hashCode()); + result = prime * result + ((type == null) ? 
0 : type.hashCode()); + return result; + } + + /* (non-Javadoc) + * @see java.lang.Object#equals(java.lang.Object) + */ + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (getClass() != obj.getClass()) + return false; + ValuesAttributeImpl other = (ValuesAttributeImpl) obj; + if (bytes == null) { + if (other.bytes != null) + return false; + } else if (!bytes.equals(other.bytes)) + return false; + if (floats == null) { + if (other.floats != null) + return false; + } else if (!floats.equals(other.floats)) + return false; + if (ints == null) { + if (other.ints != null) + return false; + } else if (!ints.equals(other.ints)) + return false; + if (type == null) { + if (other.type != null) + return false; + } else if (!type.equals(other.type)) + return false; + return true; + } + + public Comparator bytesComparator() { + return bytesComp; + } + + public void setBytesComparator(Comparator comp) { + bytesComp = comp; + } + + + +} Property changes on: src/java/org/apache/lucene/index/values/ValuesAttributeImpl.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/values/ValuesEnum.java =================================================================== --- src/java/org/apache/lucene/index/values/ValuesEnum.java (revision 0) +++ src/java/org/apache/lucene/index/values/ValuesEnum.java (revision 0) @@ -0,0 +1,109 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeSource; + +public abstract class ValuesEnum extends DocIdSetIterator{ + private AttributeSource source; + protected final ValuesAttribute attr; + + + protected ValuesEnum(Values enumType) { + this(null, enumType); + } + + protected ValuesEnum(AttributeSource source, Values enumType) { + this.source = source; + boolean setType = !hasAttribute(ValuesAttribute.class); + attr = addAttribute(ValuesAttribute.class); + if (setType) + attr.setType(enumType); + } + + public AttributeSource attributes() { + if (source == null) + source = new AttributeSource(); + return source; + } + + public T addAttribute(Class attr) { + return attributes().addAttribute(attr); + } + + public T getAttribute(Class attr) { + return attributes().getAttribute(attr); + } + + public boolean hasAttribute(Class attr) { + return attributes().hasAttribute(attr); + } + + public abstract void close() throws IOException; + + // nocommit - factor those out to utils and add common methods like other *Refs provide + // nocommit - this should be LongRef I guess as IntsRef already exists + public static class IntsRef { + public int pos; + public long[] val; + + public IntsRef() { + this(new long[1], 0); + } + + public IntsRef(long[] val, int pos) { + this.val = val; + this.pos = pos; + } + + public long get() { + return val[pos]; + } + + public void set(long value) { + val[pos] = value; + } + } + + // nocommit - factor those out to utils and add common methods like other *Refs provide + public static class FloatsRef { + public int pos; + public double[] val; + + public FloatsRef() { + this(new double[1], 0); + } + + public FloatsRef(double[] val, int pos) { + this.val = val; + this.pos = pos; + } + + public double get() { + return val[pos]; + } + + public void set(double value) { + val[pos] = value; + } + } + +} Property changes on: src/java/org/apache/lucene/index/values/ValuesEnum.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/values/VarDerefBytesImpl.java =================================================================== --- src/java/org/apache/lucene/index/values/VarDerefBytesImpl.java (revision 0) +++ src/java/org/apache/lucene/index/values/VarDerefBytesImpl.java (revision 0) @@ -0,0 +1,231 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
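One contract of the IntsRef/FloatsRef scratch holders above (mirroring BytesRef reuse elsewhere in Lucene): the enum owns a single mutable instance exposed through the ValuesAttribute and overwrites it on every advance, so per-document iteration allocates nothing. The flip side, sketched here, is that consumers must deep-copy anything that has to outlive the current position:

    BytesRef scratch = attr.bytes();  // shared, overwritten on next advance
    BytesRef kept = new BytesRef();
    kept.copy(scratch);               // assumes this era's BytesRef.copy(BytesRef)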
+ */ + +import java.io.IOException; + +import org.apache.lucene.index.values.Bytes.BytesBaseSource; +import org.apache.lucene.index.values.Bytes.BytesReaderBase; +import org.apache.lucene.index.values.Bytes.BytesWriterBase; +import org.apache.lucene.index.values.FixedDerefBytesImpl.Reader.DerefBytesEnum; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesHash; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.packed.PackedInts; + +// Stores variable-length byte[] by deref, ie when two docs +// have the same value, they store only 1 byte[] and both +// docs reference that single source + +class VarDerefBytesImpl { + + static final String CODEC_NAME = "VarDerefBytes"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + static class Writer extends BytesWriterBase { + private int[] docToAddress; + private int address = 1; + + final class Entry extends BytesHash.Entry { + int address; + } + + private final BytesHash hash = new BytesHash(Entry.class) { + @Override + protected VarDerefBytesImpl.Writer.Entry newEntry() { + return new VarDerefBytesImpl.Writer.Entry(); + } + @Override + public long bytesPerEntry() { + return super.bytesPerEntry() + RamUsageEstimator.NUM_BYTES_INT; + } + }; + + public Writer(Directory dir, String id) throws IOException { + super(dir, id, CODEC_NAME, VERSION_CURRENT, false, false); + docToAddress = new int[1]; + } + + @Override + synchronized public void add(int docID, BytesRef bytes) throws IOException { + if(bytes.length == 0) + return; // default + if(datOut == null) + initDataOut(); + Entry e = hash.add(bytes); + + if (docID >= docToAddress.length) { + docToAddress = ArrayUtil.grow(docToAddress, 1+docID); + } + if (e.address == 0) { + e.address = address; + // New + if (bytes.length < 128) { + // 1 byte to store length + datOut.writeByte((byte) bytes.length); + address++; + } else { + // 2 byte to store length + datOut.writeByte((byte) (0x80 | (bytes.length & 0x7f))); + datOut.writeByte((byte) ((bytes.length>>7) & 0xff)); + address += 2; + } + datOut.writeBytes(bytes.bytes, bytes.offset, bytes.length); + address += bytes.length; + } + + docToAddress[docID] = e.address; + } + + synchronized public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_INT * docToAddress.length + hash.ramBytesUsed(); + } + + // Important that we get docCount, in case there were + // some last docs that we didn't see + @Override + synchronized public void finish(int docCount) throws IOException { + if(datOut == null) + return; + initIndexOut(); + idxOut.writeInt(address-1); + + // write index + // nocommit -- allow forcing fixed array (not -1) + // TODO(simonw): check the address calculation / make it more intuitive + PackedInts.Writer w = PackedInts.getWriter(idxOut, docCount, PackedInts.bitsRequired(address-1)); + final int limit; + if (docCount > docToAddress.length) { + limit = docToAddress.length; + } else { + limit = docCount; + } + for(int i=0;i comp; + + final class Entry extends BytesHash.Entry { + int index; + long offset; + } + + private final BytesHash hash = new BytesHash(Entry.class) { + @Override + protected VarSortedBytesImpl.Writer.Entry newEntry() { + return new VarSortedBytesImpl.Writer.Entry(); + } + + @Override + public long bytesPerEntry() { + 
return super.bytesPerEntry() + RamUsageEstimator.NUM_BYTES_INT + + RamUsageEstimator.NUM_BYTES_LONG; + } + }; + + public Writer(Directory dir, String id, Comparator comp) + throws IOException { + super(dir, id, CODEC_NAME, VERSION_CURRENT, false, false); + this.comp = comp; + docToEntry = new Entry[1]; + } + + @Override + synchronized public void add(int docID, BytesRef bytes) throws IOException { + if(bytes.length == 0) + return;// default + if (docID >= docToEntry.length) { + Entry[] newArray = new Entry[ArrayUtil.oversize(1 + docID, + RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(docToEntry, 0, newArray, 0, docToEntry.length); + docToEntry = newArray; + } + docToEntry[docID] = hash.add(bytes); + } + + synchronized public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_OBJ_REF * docToEntry.length + + hash.ramBytesUsed(); + } + + // Important that we get docCount, in case there were + // some last docs that we didn't see + @Override + synchronized public void finish(int docCount) throws IOException { + final int count = hash.size(); + if (count == 0) + return; + initIndexOut(); + initDataOut(); + Entry[] sortedEntries = hash.sort(comp); + + // first dump bytes data, recording index & offset as + // we go + long offset = 0; + long lastOffset = 0; + for (int i = 0; i < count; i++) { + final Entry e = sortedEntries[i]; + e.offset = offset; + e.index = 1 + i; + + final BytesRef bytes = hash.getBytes(e); + // TODO: we could prefix code... + datOut.writeBytes(bytes.bytes, bytes.offset, bytes.length); + lastOffset = offset; + offset += bytes.length; + } + + // total bytes of data + idxOut.writeLong(offset); + + // write index -- first doc -> 1+ord + // nocommit -- allow not -1: + final PackedInts.Writer indexWriter = PackedInts.getWriter(idxOut, + docCount, PackedInts.bitsRequired(count)); + final int limit = docCount > docToEntry.length ? docToEntry.length + : docCount; + for (int i = 0; i < limit; i++) { + final Entry e = docToEntry[i]; + indexWriter.add(e == null ? 
0 : e.index); + } + for (int i = limit; i < docCount; i++) { + indexWriter.add(0); + } + indexWriter.finish(); + + // next ord (0-based) -> offset + // nocommit -- allow not -1: + PackedInts.Writer offsetWriter = PackedInts.getWriter(idxOut, count, + PackedInts.bitsRequired(lastOffset)); + for (int i = 0; i < count; i++) { + offsetWriter.add(sortedEntries[i].offset); + } + offsetWriter.finish(); + + super.finish(docCount); + } + } + + public static class Reader extends BytesReaderBase { + + Reader(Directory dir, String id, int maxDoc) throws IOException { + super(dir, id, CODEC_NAME, VERSION_START, true); + } + + @Override + public org.apache.lucene.index.values.Reader.Source load() + throws IOException { + return loadSorted(null); + } + + @Override + public SortedSource loadSorted(Comparator comp) + throws IOException { + return new Source(cloneData(), cloneIndex(), comp); + } + + private static class Source extends BytesBaseSortedSource { + // TODO: paged data + private final byte[] data; + private final BytesRef bytesRef = new BytesRef(); + private final PackedInts.Reader docToOrdIndex; + private final PackedInts.Reader ordToOffsetIndex; // 0-based + private final long totBytes; + private final int valueCount; + private final LookupResult lookupResult = new LookupResult(); + private final Comparator comp; + + public Source(IndexInput datIn, IndexInput idxIn, + Comparator comp) throws IOException { + super(datIn, idxIn); + totBytes = idxIn.readLong(); + data = new byte[(int) totBytes]; + datIn.readBytes(data, 0, (int) totBytes); + docToOrdIndex = PackedInts.getReader(idxIn); + ordToOffsetIndex = PackedInts.getReader(idxIn); + valueCount = ordToOffsetIndex.size(); + bytesRef.bytes = data; + // default byte sort order + this.comp = comp == null ? BytesRef.getUTF8SortedAsUnicodeComparator() + : comp; + + } + + @Override + public BytesRef getByOrd(int ord) { + return ord == 0 ? defaultValue : deref(--ord); + } + + @Override + public int ord(int docID) { + return (int) docToOrdIndex.get(docID); + } + + @Override + public LookupResult getByValue(BytesRef bytes) { + return binarySearch(bytes, 0, valueCount - 1); + } + + public long ramBytesUsed() { + // TODO(simonw): move ram usage to PackedInts? + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + + data.length + + (RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + docToOrdIndex + .getBitsPerValue() + * docToOrdIndex.getBitsPerValue()) + + (RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + ordToOffsetIndex + .getBitsPerValue() + * ordToOffsetIndex.getBitsPerValue()); + } + + @Override + public int getValueCount() { + return valueCount; + } + + // ord is 0-based + private BytesRef deref(int ord) { + bytesRef.offset = (int) ordToOffsetIndex.get(ord); + final long nextOffset; + if (ord == valueCount - 1) { + nextOffset = totBytes; + } else { + nextOffset = ordToOffsetIndex.get(1 + ord); + } + bytesRef.length = (int) (nextOffset - bytesRef.offset); + return bytesRef; + } + + // TODO: share w/ FixedSortedBytesValues? 
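+ // Unlike the fixed-size variant, every probe below must go through
+ // deref(mid) to recover both offset and length from the offset index
+ // before comparing -- variable-length values can't be addressed as
+ // mid * size.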
+ private LookupResult binarySearch(BytesRef b, int low, int high) { + + while (low <= high) { + int mid = (low + high) >>> 1; + deref(mid); + final int cmp = comp.compare(bytesRef, b); + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + lookupResult.ord = mid + 1; + lookupResult.found = true; + return lookupResult; + } + } + assert comp.compare(bytesRef, b) != 0; + lookupResult.ord = low; + lookupResult.found = false; + return lookupResult; + } + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + return new VarSortedBytesEnum(source, cloneData(), cloneIndex()); + } + + private static class VarSortedBytesEnum extends ValuesEnum { + + private PackedInts.Reader docToOrdIndex; + private PackedInts.Reader ordToOffsetIndex; + private IndexInput idxIn; + private IndexInput datIn; + private final BytesRef bytesRef; + private int valueCount; + private long totBytes; + private int docCount; + private int pos = -1; + private final long fp; + + protected VarSortedBytesEnum(AttributeSource source, IndexInput datIn, + IndexInput idxIn) throws IOException { + super(source, Values.BYTES_VAR_SORTED); + bytesRef = attr.bytes(); + totBytes = idxIn.readLong(); + // keep that in memory to prevent lots of disk seeks + docToOrdIndex = PackedInts.getReader(idxIn); + ordToOffsetIndex = PackedInts.getReader(idxIn); + valueCount = ordToOffsetIndex.size(); + docCount = docToOrdIndex.size(); + fp = datIn.getFilePointer(); + this.idxIn = idxIn; + this.datIn = datIn; + } + + @Override + public void close() throws IOException { + idxIn.close(); + datIn.close(); + } + + @Override + public int advance(int target) throws IOException { + if (target >= docCount) + return pos = NO_MORE_DOCS; + final int ord = (int) docToOrdIndex.get(target)-1; + if(ord == -1) { + bytesRef.length = 0; + bytesRef.offset = 0; + return pos = target; + } + final long offset = ordToOffsetIndex.get(ord); + final long nextOffset; + if (ord == valueCount - 1) { + nextOffset = totBytes; + } else { + nextOffset = ordToOffsetIndex.get(1 + ord); + } + final int length = (int) (nextOffset - offset); + datIn.seek(fp + offset); + if (bytesRef.bytes.length < length) + bytesRef.grow(length); + datIn.readBytes(bytesRef.bytes, 0, length); + bytesRef.length = (int) length; + bytesRef.offset = 0; + return pos = target; + } + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws IOException { + return advance(pos+1); + } + } + } +} Property changes on: src/java/org/apache/lucene/index/values/VarSortedBytesImpl.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/values/VarStraightBytesImpl.java =================================================================== --- src/java/org/apache/lucene/index/values/VarStraightBytesImpl.java (revision 0) +++ src/java/org/apache/lucene/index/values/VarStraightBytesImpl.java (revision 0) @@ -0,0 +1,223 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
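To summarize the var-sorted layout above: one concatenated data blob in comparator order plus two packed-ints tables in the index file, with value lengths recovered from adjacent offsets instead of being stored:

    // .dat: [codec header][v0][v1]...[v(n-1)]           comparator order
    // .idx: [codec header][totBytes : long]
    //       [docToOrd    : packed, per doc, 0 = no value, else 1 + ord]
    //       [ordToOffset : packed, per value, 0-based]
    // length(ord) = (ord == n-1 ? totBytes : ordToOffset[ord+1]) - ordToOffset[ord]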
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.values.Bytes.BytesBaseSource; +import org.apache.lucene.index.values.Bytes.BytesReaderBase; +import org.apache.lucene.index.values.Bytes.BytesWriterBase; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.packed.PackedInts; + +// Variable length byte[] per document, no sharing + +class VarStraightBytesImpl { + + static final String CODEC_NAME = "VarStraightBytes"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + static class Writer extends BytesWriterBase { + private int address; + // start at -1 if the first added value is > 0 + private int lastDocID = -1; + private int[] docToAddress; + + public Writer(Directory dir, String id) throws IOException { + super(dir, id, CODEC_NAME, VERSION_CURRENT, false, false); + docToAddress = new int[1]; + } + + // Fills up to but not including this docID + private void fill(final int docID) { + if (docID >= docToAddress.length) { + docToAddress = ArrayUtil.grow(docToAddress, 1 + docID); + } + for (int i = lastDocID + 1; i < docID; i++) { + docToAddress[i] = address; + } + lastDocID = docID; + } + + @Override + synchronized public void add(int docID, BytesRef bytes) throws IOException { + if(bytes.length == 0) + return; // default + if (datOut == null) + initDataOut(); + fill(docID); + docToAddress[docID] = address; + datOut.writeBytes(bytes.bytes, bytes.offset, bytes.length); + address += bytes.length; + } + + @Override + synchronized public void finish(int docCount) throws IOException { + if (datOut == null) + return; + initIndexOut(); + // write all lengths to index + // write index + fill(docCount); + idxOut.writeVInt(address); + // nocommit -- allow not -1 + final PackedInts.Writer w = PackedInts.getWriter(idxOut, docCount, + PackedInts.bitsRequired(address)); + for (int i = 0; i < docCount; i++) { + w.add(docToAddress[i]); + } + w.finish(); + super.finish(docCount); + } + + synchronized public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + + RamUsageEstimator.NUM_BYTES_INT * docToAddress.length; + } + } + + public static class Reader extends BytesReaderBase { + private final int maxDoc; + + Reader(Directory dir, String id, int maxDoc) throws IOException { + super(dir, id, CODEC_NAME, VERSION_START, true); + this.maxDoc = maxDoc; + } + + @Override + public Source load() throws IOException { + return new Source(cloneData(), cloneIndex()); + } + + private class Source extends BytesBaseSource { + private final int totBytes; + // TODO: paged data + private final byte[] data; + private final BytesRef bytesRef = new BytesRef(); + private final PackedInts.Reader addresses; + + public Source(IndexInput datIn, IndexInput idxIn) throws IOException { + super(datIn, idxIn); + totBytes = idxIn.readVInt(); + data = new byte[totBytes]; + datIn.readBytes(data, 0, 
totBytes); + addresses = PackedInts.getReader(idxIn); + bytesRef.bytes = data; + } + + @Override + public BytesRef bytes(int docID) { + final int address = (int) addresses.get(docID); + bytesRef.offset = address; + if (docID == maxDoc - 1) { + bytesRef.length = totBytes - bytesRef.offset; + } else { + bytesRef.length = (int) addresses.get(1 + docID) - bytesRef.offset; + } + return bytesRef; + } + + @Override + public int getValueCount() { + throw new UnsupportedOperationException(); + } + + public long ramBytesUsed() { + // TODO(simonw): move address ram usage to PackedInts? + return RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + data.length + (RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + addresses.getBitsPerValue() * addresses.size()); + } + } + + @Override + public ValuesEnum getEnum(AttributeSource source) throws IOException { + return new VarStraightBytesEnum(source, cloneData(), cloneIndex()); + } + + private class VarStraightBytesEnum extends ValuesEnum { + private final PackedInts.Reader addresses; + private final IndexInput datIn; + private final IndexInput idxIn; + private final long fp; + private final int totBytes; + private final BytesRef ref; + private int pos = -1; + + protected VarStraightBytesEnum(AttributeSource source, IndexInput datIn, + IndexInput idxIn) throws IOException { + super(source, Values.BYTES_VAR_STRAIGHT); + totBytes = idxIn.readVInt(); + fp = datIn.getFilePointer(); + addresses = PackedInts.getReader(idxIn); + this.datIn = datIn; + this.idxIn = idxIn; + ref = attr.bytes(); + + } + + @Override + public void close() throws IOException { + datIn.close(); + idxIn.close(); + } + + @Override + public int advance(final int target) throws IOException { + if (target >= maxDoc) { + ref.length = 0; + ref.offset = 0; + return pos = NO_MORE_DOCS; + } + final long addr = addresses.get(target); + if (addr == totBytes) { + // nocommit -- is that a valid default value? + ref.length = 0; + ref.offset = 0; + return pos = target; + } + datIn.seek(fp + addr); + final int size = (int) (target == maxDoc - 1 ? totBytes - addr : addresses.get(target + 1) - addr); + if (ref.bytes.length < size) + ref.grow(size); + ref.length = size; + datIn.readBytes(ref.bytes, 0, size); + return pos = target; + } + + @Override + public int docID() { + return pos; + } + + @Override + public int nextDoc() throws IOException { + return advance(pos+1); + } + } + } +} Property changes on: src/java/org/apache/lucene/index/values/VarStraightBytesImpl.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/index/values/Writer.java =================================================================== --- src/java/org/apache/lucene/index/values/Writer.java (revision 0) +++ src/java/org/apache/lucene/index/values/Writer.java (revision 0) @@ -0,0 +1,94 @@ +package org.apache.lucene.index.values; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.ConsumesRAM; + +public abstract class Writer implements ConsumesRAM { + + /** Records the specified value for the docID */ + public void add(int docID, long value) throws IOException { + throw new UnsupportedOperationException(); + } + + /** Records the specified value for the docID */ + public void add(int docID, double value) throws IOException { + throw new UnsupportedOperationException(); + } + + /** Records the specified value for the docID */ + public void add(int docID, BytesRef value) throws IOException { + throw new UnsupportedOperationException(); + } + + /** Records the current attribute's value for the docID */ + protected abstract void add(int docID) throws IOException; + + protected abstract void setNextAttribute(ValuesAttribute attr); + + /** Finish writing, close any files */ + public abstract void finish(int docCount) throws IOException; + + public static class MergeState { + public final Reader reader; + public final int docBase; + public final int docCount; + public final Bits bits; + + public MergeState(Reader reader, int docBase, int docCount, Bits bits) { + assert reader != null; + this.reader = reader; + this.docBase = docBase; + this.docCount = docCount; + this.bits = bits; + } + } + + public void add(List<MergeState> states) throws IOException { + for (MergeState state : states) { + merge(state); + } + } + + // enables bulk copies in subclasses per MergeState + protected void merge(MergeState state) throws IOException { + final ValuesEnum valEnum = state.reader.getEnum(); + assert valEnum != null; + try { + final ValuesAttribute attr = valEnum.addAttribute(ValuesAttribute.class); + setNextAttribute(attr); + int docID = state.docBase; + final Bits bits = state.bits; + final int docCount = state.docCount; + for (int i = 0; i < docCount; i++) { + if (bits == null || !bits.get(i)) { + if (valEnum.advance(i) == ValuesEnum.NO_MORE_DOCS) + break; + add(docID++); + } + } + } finally { + valEnum.close(); + } + } +} Property changes on: src/java/org/apache/lucene/index/values/Writer.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/search/FieldComparator.java =================================================================== --- src/java/org/apache/lucene/search/FieldComparator.java (revision 983076) +++ src/java/org/apache/lucene/search/FieldComparator.java (working copy) @@ -22,6 +22,7 @@ import java.util.Locale; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.values.Reader.Source; import org.apache.lucene.search.FieldCache.DoubleParser; import org.apache.lucene.search.FieldCache.LongParser; import org.apache.lucene.search.FieldCache.ByteParser; @@ -143,7 +144,6 @@ * @param reader current reader * @param docBase docBase of this reader * @throws IOException - * @throws IOException */ public abstract void 
setNextReader(IndexReader reader, int docBase) throws IOException; @@ -318,6 +318,64 @@ } } + /** Uses float index values to sort by ascending value */ + public static final class FloatIndexValuesComparator extends FieldComparator { + private final double[] values; + private Source currentReaderValues; + private final String field; + private double bottom; + + FloatIndexValuesComparator(int numHits, String field) { + values = new double[numHits]; + this.field = field; + } + + @Override + public int compare(int slot1, int slot2) { + final double v1 = values[slot1]; + final double v2 = values[slot2]; + if (v1 > v2) { + return 1; + } else if (v1 < v2) { + return -1; + } else { + return 0; + } + } + + @Override + public int compareBottom(int doc) { + final double v2 = currentReaderValues.floats(doc); + if (bottom > v2) { + return 1; + } else if (bottom < v2) { + return -1; + } else { + return 0; + } + } + + @Override + public void copy(int slot, int doc) { + values[slot] = currentReaderValues.floats(doc); + } + + @Override + public void setNextReader(IndexReader reader, int docBase) throws IOException { + currentReaderValues = reader.getIndexValuesCache().getFloats(field); + } + + @Override + public void setBottom(final int bottom) { + this.bottom = values[bottom]; + } + + @Override + public Comparable value(int slot) { + return Double.valueOf(values[slot]); + } + } + /** Parses field's values as float (using {@link * FieldCache#getFloats} and sorts by ascending value */ public static final class FloatComparator extends FieldComparator { @@ -452,6 +510,68 @@ } } + /** Loads int index values and sorts by ascending value. */ + public static final class IntIndexValuesComparator extends FieldComparator { + private final long[] values; + private Source currentReaderValues; + private final String field; + private long bottom; + + IntIndexValuesComparator(int numHits, String field) { + values = new long[numHits]; + this.field = field; + } + + @Override + public int compare(int slot1, int slot2) { + // TODO: there are sneaky non-branch ways to compute + // -1/+1/0 sign + final long v1 = values[slot1]; + final long v2 = values[slot2]; + if (v1 > v2) { + return 1; + } else if (v1 < v2) { + return -1; + } else { + return 0; + } + } + + @Override + public int compareBottom(int doc) { + // TODO: there are sneaky non-branch ways to compute + // -1/+1/0 sign + final long v2 = currentReaderValues.ints(doc); + if (bottom > v2) { + return 1; + } else if (bottom < v2) { + return -1; + } else { + return 0; + } + } + + @Override + public void copy(int slot, int doc) { + values[slot] = currentReaderValues.ints(doc); + } + + @Override + public void setNextReader(IndexReader reader, int docBase) throws IOException { + currentReaderValues = reader.getIndexValuesCache().getInts(field); + } + + @Override + public void setBottom(final int bottom) { + this.bottom = values[bottom]; + } + + @Override + public Comparable value(int slot) { + return Long.valueOf(values[slot]); + } + } + /** Parses field's values as long (using {@link * FieldCache#getLongs} and sorts by ascending value */ public static final class LongComparator extends FieldComparator { Index: src/java/org/apache/lucene/search/ReqExclScorer.java =================================================================== --- src/java/org/apache/lucene/search/ReqExclScorer.java (revision 983076) +++ src/java/org/apache/lucene/search/ReqExclScorer.java (working copy) @@ -23,7 +23,7 @@ /** A Scorer for queries with a required subscorer * and an excluding (prohibited) sub 
DocIdSetIterator. *
- * This Scorer implements {@link Scorer#skipTo(int)}, + * This Scorer implements {@link Scorer#advance(int)}, * and it uses the skipTo() on the given scorers. */ class ReqExclScorer extends Scorer { Index: src/java/org/apache/lucene/search/ReqOptSumScorer.java =================================================================== --- src/java/org/apache/lucene/search/ReqOptSumScorer.java (revision 983076) +++ src/java/org/apache/lucene/search/ReqOptSumScorer.java (working copy) @@ -21,7 +21,7 @@ /** A Scorer for queries with a required part and an optional part. * Delays skipTo() on the optional part until a score() is needed. *
- * This Scorer implements {@link Scorer#skipTo(int)}. + * This Scorer implements {@link Scorer#advance(int)}. */ class ReqOptSumScorer extends Scorer { /** The scorers passed from the constructor. Index: src/java/org/apache/lucene/search/SortField.java =================================================================== --- src/java/org/apache/lucene/search/SortField.java (revision 983076) +++ src/java/org/apache/lucene/search/SortField.java (working copy) @@ -19,9 +19,15 @@ import java.io.IOException; import java.io.Serializable; +import java.util.Comparator; import java.util.Locale; import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.BytesRef; + +// nocommit -- for cleaner transition, maybe we should make +// a new SortField that subclasses this one and always uses +// index values? /** * Stores information about how to sort documents by terms in an individual @@ -83,6 +89,9 @@ * uses ordinals to do the sorting. */ public static final int STRING_VAL = 11; + /** Sort using byte[] index values. */ + public static final int BYTES = 12; + /** Represents sorting by document score (relevancy). */ public static final SortField FIELD_SCORE = new SortField (null, SCORE); @@ -358,6 +367,26 @@ field = StringHelper.intern(field); } + private boolean useIndexValues; + + public void setUseIndexValues(boolean b) { + useIndexValues = b; + } + + public boolean getUseIndexValues() { + return useIndexValues; + } + + private Comparator<BytesRef> bytesComparator = BytesRef.getUTF8SortedAsUnicodeComparator(); + + public void setBytesComparator(Comparator<BytesRef> b) { + bytesComparator = b; + } + + public Comparator<BytesRef> getBytesComparator() { + return bytesComparator; + } + /** Returns the {@link FieldComparator} to use for * sorting. * @@ -387,10 +416,18 @@ return new FieldComparator.DocComparator(numHits); case SortField.INT: - return new FieldComparator.IntComparator(numHits, field, parser); + if (useIndexValues) { + return new FieldComparator.IntIndexValuesComparator(numHits, field); + } else { + return new FieldComparator.IntComparator(numHits, field, parser); + } case SortField.FLOAT: - return new FieldComparator.FloatComparator(numHits, field, parser); + if (useIndexValues) { + return new FieldComparator.FloatIndexValuesComparator(numHits, field); + } else { + return new FieldComparator.FloatComparator(numHits, field, parser); + } case SortField.LONG: return new FieldComparator.LongComparator(numHits, field, parser); Index: src/java/org/apache/lucene/util/BytesHash.java =================================================================== --- src/java/org/apache/lucene/util/BytesHash.java (revision 0) +++ src/java/org/apache/lucene/util/BytesHash.java (revision 0) @@ -0,0 +1,377 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// nocommit -- move to util? +import java.lang.reflect.Array; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; + +import org.apache.lucene.index.ByteBlockPool; + +/** + * Hashes BytesRefs. BytesRef must be no longer than XXX in + * length. + * + * NOTE: this class is meant only to be used internally + * by Lucene; it's only public so it can be shared across + * packages. This means the API is freely subject to + * change, and, the class could be removed entirely, in any + * Lucene release. Use directly at your own risk! + */ + +// nocommit -- reuse Entry instances? +public abstract class BytesHash<T extends BytesHash.Entry> { + + // nocommit -- factor properly so the byte pool uses this + // NOT DW's + public final static int BYTES_BLOCK_SHIFT = 15; + public final static int BYTES_BLOCK_SIZE = 1 << BYTES_BLOCK_SHIFT; + public final static int BYTES_BLOCK_MASK = BYTES_BLOCK_SIZE - 1; + + // nocommit -- reuse? + private static class ByteBlockAllocator extends ByteBlockPool.Allocator { + int blockUsedCount; + + @Override + public byte[] getByteBlock() { + blockUsedCount++; + return new byte[BYTES_BLOCK_SIZE]; + } + + @Override + public void recycleByteBlocks(byte[][] blocks, int start, int end) { + blockUsedCount -= end-start; + } + + public long ramBytesUsed() { + return blockUsedCount * BYTES_BLOCK_SIZE; + } + + @Override + public void recycleByteBlocks(List<byte[]> blocks) { + blockUsedCount -= blocks.size(); + } + } + + public static class Entry { + public int bytesStart; + } + + private final Class<T> cl; + public final ByteBlockPool pool; + private int hashSize = 4; + private int hashHalfSize = hashSize/2; + private int hashMask = hashSize-1; + private int count; + private int lastCount = -1; + private final ByteBlockAllocator allocator; + private T[] hash; + + @SuppressWarnings("unchecked") + public BytesHash(Class<T> cl) { + this.cl = cl; + allocator = new ByteBlockAllocator(); + pool = new ByteBlockPool(allocator); + hash = (T[]) Array.newInstance(cl, hashSize); + } + + public int size() { + return count; + } + + public BytesRef getBytes(T e) { + return deref(e.bytesStart, scratch1); + } + + /** Destructive operation -- returns all Entry instances, + * in arbitrary order */ + public T[] compact() { + int upto = 0; + for(int i=0;i<hashSize;i++) { + if (hash[i] != null) { + if (upto < i) { + hash[upto] = hash[i]; + hash[i] = null; + } + upto++; + } + } + assert upto == count; + return hash; + } + + /** Destructive operation -- sorts then returns all Entry + * instances, ordered by the given comparator */ + public T[] sort(Comparator<BytesRef> comp) { + compact(); + quickSort(comp, hash, 0, count-1); + return hash; + } + + void quickSort(Comparator<BytesRef> comp, T[] entries, int lo, int hi) { + if (lo >= hi) + return; + else if (hi == 1+lo) { + if (compare(comp, entries[lo], entries[hi]) > 0) { + final T tmp = entries[lo]; + entries[lo] = entries[hi]; + entries[hi] = tmp; + } + return; + } + int mid = (lo + hi) >>> 1; + if (compare(comp, entries[lo], entries[mid]) > 0) { + T tmp = entries[lo]; + entries[lo] = entries[mid]; + entries[mid] = tmp; + } + + if (compare(comp, entries[mid], entries[hi]) > 0) { + T tmp = entries[mid]; + entries[mid] = entries[hi]; + entries[hi] = tmp; + + if (compare(comp, entries[lo], entries[mid]) > 0) { + T tmp2 = entries[lo]; + entries[lo] = entries[mid]; + entries[mid] = tmp2; + } + } + + int left = lo + 1; + int right = hi - 1; + + if (left >= right) + return; + + T partition = entries[mid]; + + for (; ;) { + while (compare(comp, entries[right], partition) > 0) + --right; + + while (left < right && compare(comp, entries[left], partition) <= 0) + ++left; + + if (left < right) { + T tmp = entries[left]; + entries[left] = entries[right]; + entries[right] = tmp; + --right; + } else { + break; + } + } + + quickSort(comp, entries, lo, left); + quickSort(comp, entries, left + 1, hi); + } + + private final BytesRef scratch1 = new BytesRef(); + private final BytesRef scratch2 = new BytesRef(); + + private final BytesRef deref(int bytesStart, BytesRef b) { + b.bytes = pool.buffers[bytesStart >> BYTES_BLOCK_SHIFT]; + int pos = bytesStart & BYTES_BLOCK_MASK; + + if ((b.bytes[pos] & 0x80) == 0) { + // length is 1 byte + b.length = b.bytes[pos]; + pos += 1; + } else { + // length is 2 bytes + b.length = (b.bytes[pos]&0x7f) + ((b.bytes[pos+1]&0xff)<<7); + pos += 2; + } + b.offset = pos; + return b; + } + + private boolean equals(T e, BytesRef b) { + return deref(e.bytesStart, scratch1).bytesEquals(b); + } + + private int compare(Comparator<BytesRef> comp, T e1, T e2) { + return comp.compare(deref(e1.bytesStart, scratch1), + deref(e2.bytesStart, scratch2)); + } + + @SuppressWarnings("unchecked") + private boolean shrink(int targetSize) { + + // Cannot use ArrayUtil.shrink because we require power + // of 2: + int newSize = hashSize; + while(newSize >= 8 && newSize/4 > targetSize) { + newSize /= 2; + } + + if (newSize != hashSize) { + hashSize = newSize; + hash = (T[]) Array.newInstance(cl, hashSize); + hashHalfSize = newSize/2; + hashMask = newSize-1; + return true; + } else { + return false; + } + } + + public void clear() { + lastCount = count; + count = 0; + if (lastCount != -1) { + if (shrink(lastCount)) { + // shrink clears the hash entries + return; + } + } + Arrays.fill(hash, null); + } + + public T add(BytesRef bytes) { + int code = 0; + final int end = bytes.offset + bytes.length; + // build hash + for(int i=bytes.offset;i<end;i++) { + code = (code*31) + bytes.bytes[i]; + } + + int hashPos = code & hashMask; + T e = hash[hashPos]; + if (e != null && !equals(e, bytes)) { + // Conflict: keep searching different locations in + // the hash table. + final int inc = ((code>>8)+code)|1; + do { + code += inc; + hashPos = code & hashMask; + e = hash[hashPos]; + } while (e != null && !equals(e, bytes)); + } + + if (e == null) { + // new entry + final int len2 = 2+bytes.length; + if (len2 + pool.byteUpto > BYTES_BLOCK_SIZE) { + if (len2 > BYTES_BLOCK_SIZE) { + throw new IllegalArgumentException("bytes can be at most " + (BYTES_BLOCK_SIZE-2) + " in length; got " + bytes.length); + } + pool.nextBuffer(); + } + + e = newEntry(); + + final byte[] buffer = pool.buffer; + final int bufferUpto = pool.byteUpto; + e.bytesStart = bufferUpto + pool.byteOffset; + + // We first encode the length, followed by the + // bytes. Length is encoded as vInt, but will consume + // 1 or 2 bytes at most (we reject too-long terms, + // above). + if (bytes.length < 128) { + // 1 byte to store length + buffer[bufferUpto] = (byte) bytes.length; + pool.byteUpto += bytes.length + 1; + System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto+1, bytes.length); + } else { + // 2 byte to store length + buffer[bufferUpto] = (byte) (0x80 | (bytes.length & 0x7f)); + buffer[bufferUpto+1] = (byte) ((bytes.length>>7) & 0xff); + pool.byteUpto += bytes.length + 2; + System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto+2, bytes.length); + } + assert hash[hashPos] == null; + hash[hashPos] = e; + count++; + + if (count == hashHalfSize) { + rehash(2*hashSize); + } + } + return e; + } + + /** Called when the hash is too small (> 50% + * occupied) or too large (< 20% occupied). */ + void rehash(final int newSize) { + + final int newMask = newSize-1; + + @SuppressWarnings("unchecked") + T[] newHash = (T[]) Array.newInstance(cl, newSize); + for(int i=0;i<hashSize;i++) { + final T e0 = hash[i]; + if (e0 != null) { + int code; + final int start = e0.bytesStart & BYTES_BLOCK_MASK; + final byte[] bytes = pool.buffers[e0.bytesStart >> BYTES_BLOCK_SHIFT]; + code = 0; + + final int len; + int pos; + if ((bytes[start] & 0x80) == 0) { + // length is 1 byte + len = bytes[start]; + pos = start+1; + } else { + len = (bytes[start]&0x7f) + ((bytes[start+1]&0xff)<<7); + pos = start+2; + } + + final int endPos = pos+len; + while(pos < endPos) { + code = (code*31) + bytes[pos++]; + } + + int hashPos = code & newMask; + assert hashPos >= 0; + if (newHash[hashPos] != null) { + final int inc = ((code>>8)+code)|1; + do { + code += inc; + hashPos = code & newMask; + } while (newHash[hashPos] != null); + } + newHash[hashPos] = e0; + } + } + + hashMask = newMask; + hash = newHash; + hashSize = newSize; + hashHalfSize = newSize >> 1; + } + + protected abstract T newEntry(); + + public long ramBytesUsed() { + return allocator.ramBytesUsed() + RamUsageEstimator.NUM_BYTES_OBJ_REF * hashSize + count * bytesPerEntry(); + } + + protected long bytesPerEntry() { + return RamUsageEstimator.NUM_BYTES_OBJ_HEADER + RamUsageEstimator.NUM_BYTES_INT; + } +} Property changes on: src/java/org/apache/lucene/util/BytesHash.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/util/BytesRef.java =================================================================== --- src/java/org/apache/lucene/util/BytesRef.java (revision 983076) +++ src/java/org/apache/lucene/util/BytesRef.java (working copy) @@ -18,6 +18,7 @@ */ import java.util.Comparator; +import java.io.Serializable; import java.io.UnsupportedEncodingException; import java.io.ObjectInput; import java.io.ObjectOutput; @@ -259,12 +260,13 @@ } private final static Comparator<BytesRef> utf8SortedAsUnicodeSortOrder = new UTF8SortedAsUnicodeComparator(); - + public static Comparator<BytesRef> getUTF8SortedAsUnicodeComparator() { return utf8SortedAsUnicodeSortOrder; } - private static class UTF8SortedAsUnicodeComparator implements Comparator<BytesRef> { + @SuppressWarnings("serial") // serializable to work with contrib/remote + private static final class UTF8SortedAsUnicodeComparator implements Serializable, Comparator<BytesRef> { // Only singleton private UTF8SortedAsUnicodeComparator() {}; Index: src/java/org/apache/lucene/util/ConsumesRAM.java =================================================================== --- src/java/org/apache/lucene/util/ConsumesRAM.java (revision 0) +++ src/java/org/apache/lucene/util/ConsumesRAM.java (revision 0) @@ -0,0 +1,22 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public interface ConsumesRAM { + public long ramBytesUsed(); +} Property changes on: src/java/org/apache/lucene/util/ConsumesRAM.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/java/org/apache/lucene/util/packed/Packed64.java =================================================================== --- src/java/org/apache/lucene/util/packed/Packed64.java (revision 983076) +++ src/java/org/apache/lucene/util/packed/Packed64.java (working copy) @@ -182,7 +182,7 @@ final int bitPos = (int)(majorBitPos & MOD_MASK); // % BLOCK_SIZE); final int base = bitPos * FAC_BITPOS; - + assert elementPos < blocks.length : "elementPos: " + elementPos + "; blocks.len: " + blocks.length; return ((blocks[elementPos] << shifts[base]) >>> shifts[base+1]) | ((blocks[elementPos+1] >>> shifts[base+2]) & readMasks[bitPos]); } Index: src/java/org/apache/lucene/util/packed/PackedInts.java =================================================================== --- src/java/org/apache/lucene/util/packed/PackedInts.java (revision 983076) +++ src/java/org/apache/lucene/util/packed/PackedInts.java (working copy) @@ -77,6 +77,21 @@ /** Returns number of values */ int size(); } + + /** + * Run-once, forward-only enum interface to decode and skip previously + * saved {@link PackedInts} + */ + public static interface PackedIntEnum extends ReaderIterator { + + /** Returns the current position of the enum */ + int docID(); + + /** Skips to the position in the enum and returns its value. + * @return the value at the given position + * @throws IOException if reading the value throws an IOException */ + long advance(int position) throws IOException; + } /** * A packed integer array that can be modified. 
@@ -192,9 +207,13 @@ final int bitsPerValue = in.readVInt(); assert bitsPerValue > 0 && bitsPerValue <= 64: "bitsPerValue=" + bitsPerValue; final int valueCount = in.readVInt(); - return new PackedReaderIterator(bitsPerValue, valueCount, in); } + + public static PackedIntEnum getEnum(IndexInput in ) throws IOException{ + // ReaderIterator implements PackedIntEnum + return (PackedIntEnum)getReaderIterator(in); + } /** * Create a packed integer array with the given amount of values initialized Index: src/java/org/apache/lucene/util/packed/PackedReaderIterator.java =================================================================== --- src/java/org/apache/lucene/util/packed/PackedReaderIterator.java (revision 983076) +++ src/java/org/apache/lucene/util/packed/PackedReaderIterator.java (working copy) @@ -21,12 +21,13 @@ import java.io.IOException; -class PackedReaderIterator implements PackedInts.ReaderIterator { +class PackedReaderIterator implements PackedInts.PackedIntEnum { private long pending; private int pendingBitsLeft; private final IndexInput in; private final int bitsPerValue; private final int valueCount; + private int position = -1; // masks[n-1] masks for bottom n bits private final long[] masks; @@ -57,16 +58,16 @@ } public long next() throws IOException { + if (pendingBitsLeft == 0) { pending = in.readLong(); pendingBitsLeft = 64; } - + final long result; if (pendingBitsLeft >= bitsPerValue) { // not split - final long result = (pending >> (pendingBitsLeft - bitsPerValue)) & masks[bitsPerValue-1]; + result = (pending >> (pendingBitsLeft - bitsPerValue)) & masks[bitsPerValue-1]; pendingBitsLeft -= bitsPerValue; - return result; } else { // split final int bits1 = bitsPerValue - pendingBitsLeft; @@ -74,11 +75,37 @@ pending = in.readLong(); final long result2 = (pending >> (64 - bits1)) & masks[bits1-1]; pendingBitsLeft = 64 + pendingBitsLeft - bitsPerValue; - return result1 | result2; + result = result1 | result2; } + ++position; + return result; } public void close() throws IOException { in.close(); } + + public int docID() { + return position; + } + + public long advance(final int ord) throws IOException{ + assert ord < valueCount : "ord must be less than valueCount"; + assert ord > position : "ord must be greater than the current position"; + final int posToSkip = ord - 1 - position; + final long bitsToSkip = ((long)bitsPerValue * (long)posToSkip); + if(bitsToSkip < pendingBitsLeft ){ + pendingBitsLeft -= bitsToSkip; + }else { + final long skip = bitsToSkip- pendingBitsLeft; + final long filePointer = in.getFilePointer(); + final long closestByte = (skip >> 6) << 3; + if(closestByte > 0) + in.seek(filePointer + closestByte); + pending = in.readLong(); + pendingBitsLeft = 64 - (int)(skip % 64); + } + position = ord-1; + return next(); + } } Index: src/test/org/apache/lucene/index/TestByteSlices.java =================================================================== --- src/test/org/apache/lucene/index/TestByteSlices.java (revision 983076) +++ src/test/org/apache/lucene/index/TestByteSlices.java (working copy) @@ -26,7 +26,7 @@ /* Allocate another byte[] from the shared pool */ @Override - synchronized byte[] getByteBlock() { + public synchronized byte[] getByteBlock() { final int size = freeByteBlocks.size(); final byte[] b; if (0 == size) @@ -38,13 +38,13 @@ /* Return a byte[] to the pool */ @Override - synchronized void recycleByteBlocks(byte[][] blocks, int start, int end) { + public synchronized void recycleByteBlocks(byte[][] blocks, int start, int end) { for(int 
i=start;i blocks) { + public synchronized void recycleByteBlocks(List blocks) { final int size = blocks.size(); for(int i=0;i comp = mode == Bytes.Mode.SORTED ? BytesRef + .getUTF8SortedAsUnicodeComparator() + : null; + + Directory dir = new MockRAMDirectory(); + Writer w = Bytes + .getWriter(dir, "test", mode, comp, fixedSize); + int maxDoc = 220; + final String[] values = new String[maxDoc]; + final int lenMin, lenMax; + if (fixedSize) { + lenMin = lenMax = 3 + rand.nextInt(7); + } else { + lenMin = 1; + lenMax = 15 + rand.nextInt(6); + } + for (int i = 0; i < 100; i++) { + final String s; + if (i > 0 && rand.nextInt(5) <= 2) { + // use prior value + s = values[2 * rand.nextInt(i)]; + } else { + s = _TestUtil.randomUnicodeString(rand, lenMin, lenMax); + } + values[2 * i] = s; + + UnicodeUtil.UTF16toUTF8(s, 0, s.length(), bytesRef); + w.add(2 * i, bytesRef); + } + w.finish(maxDoc); + + Reader r = Bytes.getReader(dir, "test", mode, fixedSize, maxDoc); + for (int iter = 0; iter < 2; iter++) { + ValuesEnum bytesEnum = r.getEnum(); + assertNotNull("enum is null", bytesEnum); + ValuesAttribute attr = bytesEnum.addAttribute(ValuesAttribute.class); + assertNotNull("attribute is null", attr); + BytesRef ref = attr.bytes(); + assertNotNull("BytesRef is null - enum not initialized to use bytes", attr); + + for (int i = 0; i < 2; i++) { + final int idx = 2 * i; + assertEquals("doc: " + idx, idx, bytesEnum.advance(idx)); + assertEquals("doc: " + idx, values[idx], ref.utf8ToString()); + } + assertEquals(ValuesEnum.NO_MORE_DOCS, bytesEnum.advance(maxDoc)); + assertEquals(ValuesEnum.NO_MORE_DOCS, bytesEnum.advance(maxDoc+1)); + + bytesEnum.close(); + } + + + // Verify we can load source twice: + for (int iter = 0; iter < 2; iter++) { + Source s; + Reader.SortedSource ss; + if (mode == Bytes.Mode.SORTED) { + s = ss = r.loadSorted(comp); + } else { + s = r.load(); + ss = null; + } + + for (int i = 0; i < 100; i++) { + final int idx = 2 * i; + assertNotNull("doc " + idx + "; value=" + values[idx], s.bytes(idx)); + assertEquals("doc " + idx, values[idx], s.bytes(idx).utf8ToString()); + if (ss != null) { + assertEquals("doc " + idx, values[idx], ss.getByOrd(ss.ord(idx)) + .utf8ToString()); + Reader.SortedSource.LookupResult result = ss.getByValue(new BytesRef( + values[idx])); + assertTrue(result.found); + assertEquals(ss.ord(idx), result.ord); + } + } + + // Lookup random strings: + if (mode == Bytes.Mode.SORTED) { + final int numValues = ss.getValueCount(); + for (int i = 0; i < 1000; i++) { + BytesRef bytesValue = new BytesRef(_TestUtil.randomUnicodeString( + rand, lenMin, lenMax)); + SortedSource.LookupResult result = ss.getByValue(bytesValue); + if (result.found) { + assert result.ord > 0; + assertTrue(bytesValue.bytesEquals(ss.getByOrd(result.ord))); + int count = 0; + for (int k = 0; k < 100; k++) { + if (bytesValue.utf8ToString().equals(values[2 * k])) { + assertEquals(ss.ord(2 * k), result.ord); + count++; + } + } + assertTrue(count > 0); + } else { + assert result.ord >= 0; + if (result.ord == 0) { + final BytesRef firstRef = ss.getByOrd(1); + // random string was before our first + assertTrue(firstRef.compareTo(bytesValue) > 0); + } else if (result.ord == numValues) { + final BytesRef lastRef = ss.getByOrd(numValues); + // random string was after our last + assertTrue(lastRef.compareTo(bytesValue) < 0); + } else { + // random string fell between two of our values + final BytesRef before = (BytesRef) ss.getByOrd(result.ord) + .clone(); + final BytesRef after = ss.getByOrd(result.ord + 1); + 
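// on a miss, the probed value must sort strictly between its two neighboring ords: +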
assertTrue(before.compareTo(bytesValue) < 0); + assertTrue(bytesValue.compareTo(after) < 0); + + } + } + } + } + } + + r.close(); + dir.close(); + } + + public void testInts() throws IOException { + final Random rand = newRandom(); + long maxV = 1; + final int NUM_VALUES = 1000; + final long[] values = new long[NUM_VALUES]; + for (int rx = 1; rx < 63; rx++, maxV *= 2) { + for (int b = 0; b < 2; b++) { + Directory dir = new MockRAMDirectory(); + boolean useFixedArrays = b == 0; + Writer w = Ints.getWriter(dir, "test", useFixedArrays); + for (int i = 0; i < NUM_VALUES; i++) { + final long v = rand.nextLong() % (1 + maxV); + values[i] = v; + w.add(i, v); + } + final int additionalDocs = 1 + rand.nextInt(9); + w.finish(NUM_VALUES + additionalDocs); + + Reader r = Ints.getReader(dir, "test", useFixedArrays); + for (int iter = 0; iter < 2; iter++) { + Source s = r.load(); + for (int i = 0; i < NUM_VALUES; i++) { + final long v = s.ints(i); + assertEquals("index " + i + " b: " + b, values[i], v); + } + } + + for (int iter = 0; iter < 2; iter++) { + ValuesEnum iEnum = r.getEnum(); + ValuesAttribute attr = iEnum.addAttribute(ValuesAttribute.class); + IntsRef ints = attr.ints(); + for (int i = 0; i < NUM_VALUES; i++) { + assertEquals(i, iEnum.nextDoc()); + assertEquals(values[i], ints.get()); + } + for (int i = NUM_VALUES; i < NUM_VALUES + additionalDocs; i++) { + assertEquals(i, iEnum.nextDoc()); + assertEquals("" + i, 0, ints.get()); + } + + iEnum.close(); + } + + for (int iter = 0; iter < 2; iter++) { + ValuesEnum iEnum = r.getEnum(); + ValuesAttribute attr = iEnum.addAttribute(ValuesAttribute.class); + IntsRef ints = attr.ints(); + for (int i = 0; i < NUM_VALUES; i += 1 + rand.nextInt(25)) { + assertEquals(i, iEnum.advance(i)); + assertEquals(values[i], ints.get()); + } + for (int i = NUM_VALUES; i < NUM_VALUES + additionalDocs; i++) { + assertEquals(i, iEnum.advance(i)); + assertEquals("" + i, 0, ints.get()); + } + + iEnum.close(); + } + r.close(); + dir.close(); + } + } + } + + public void testFloats4() throws IOException { + runTestFloats(newRandom(), 4, 0.00001); + } + + private void runTestFloats(Random rand, int precision, double delta) + throws IOException { + Directory dir = new MockRAMDirectory(); + Writer w = Floats.getWriter(dir, "test", precision); + final int NUM_VALUES = 1000; + final double[] values = new double[NUM_VALUES]; + for (int i = 0; i < NUM_VALUES; i++) { + final double v = precision == 4 ? 
rand.nextFloat() : rand.nextDouble(); + values[i] = v; + w.add(i, v); + } + final int additionalValues = 1 + rand.nextInt(10); + w.finish(NUM_VALUES + additionalValues); + + Reader r = Floats.getReader(dir, "test", NUM_VALUES + + additionalValues); + for (int iter = 0; iter < 2; iter++) { + Source s = r.load(); + for (int i = 0; i < NUM_VALUES; i++) { + assertEquals(values[i], s.floats(i)); + } + } + + for (int iter = 0; iter < 2; iter++) { + ValuesEnum fEnum = r.getEnum(); + ValuesAttribute attr = fEnum.addAttribute(ValuesAttribute.class); + FloatsRef floats = attr.floats(); + for (int i = 0; i < NUM_VALUES; i++) { + assertEquals(i, fEnum.nextDoc()); + assertEquals(values[i], floats.get(), delta); + } + for(int i = NUM_VALUES; i < NUM_VALUES + additionalValues; i++) { + assertEquals(i, fEnum.nextDoc()); + assertEquals(0.0, floats.get(), delta); + } + fEnum.close(); + } + for (int iter = 0; iter < 2; iter++) { + ValuesEnum fEnum = r.getEnum(); + ValuesAttribute attr = fEnum.addAttribute(ValuesAttribute.class); + FloatsRef floats = attr.floats(); + for (int i = 0; i < NUM_VALUES; i += 1 + rand.nextInt(25)) { + assertEquals(i, fEnum.advance(i)); + assertEquals(values[i], floats.get(), delta); + } + for(int i = NUM_VALUES; i < NUM_VALUES + additionalValues; i++) { + assertEquals(i, fEnum.advance(i)); + assertEquals(0.0, floats.get(), delta); + } + fEnum.close(); + } + + r.close(); + dir.close(); + } + + public void testFloats8() throws IOException { + runTestFloats(newRandom(), 8, 0.0); + } + + /** + * Tests complete indexing of {@link Values} including deletions, merging and + * sparse value fields on Compound-File + */ + public void testCFSIndex() throws IOException { + final Random rand = newRandom(); + // without deletions + IndexWriterConfig cfg = writerConfig(rand, true); + // primitives - no deletes + runTestNumerics(cfg, rand, false); + + cfg = writerConfig(rand, true); + // bytes - no deletes + runTestIndexBytes(cfg, rand, false); + + // with deletions + cfg = writerConfig(rand, true); + // primitives + runTestNumerics(cfg, rand, true); + + cfg = writerConfig(rand, true); + // bytes + runTestIndexBytes(cfg, rand, true); + } + + /** + * Tests complete indexing of {@link Values} including deletions, merging and + * sparse value fields on None-Compound-File + */ + public void testIndex() throws IOException { + // + final Random rand = newRandom(); + // without deletions + IndexWriterConfig cfg = writerConfig(rand, false); + // primitives - no deletes + runTestNumerics(cfg, rand, false); + + cfg = writerConfig(rand, false); + // bytes - no deletes + runTestIndexBytes(cfg, rand, false); + + // with deletions + cfg = writerConfig(rand, false); + // primitives + runTestNumerics(cfg, rand, true); + + cfg = writerConfig(rand, false); + // bytes + runTestIndexBytes(cfg, rand, true); + } + + private IndexWriterConfig writerConfig(Random rand, boolean useCompoundFile) { + final IndexWriterConfig cfg = newIndexWriterConfig(rand, + TEST_VERSION_CURRENT, new MockAnalyzer()); + MergePolicy mergePolicy = cfg.getMergePolicy(); + if(mergePolicy instanceof LogMergePolicy) { + ((LogMergePolicy)mergePolicy).setUseCompoundFile(useCompoundFile); + } else if(useCompoundFile) { + LogMergePolicy policy = new LogDocMergePolicy(); + policy.setUseCompoundFile(useCompoundFile); + cfg.setMergePolicy(policy); + } + return cfg; + } + + public void runTestNumerics(IndexWriterConfig cfg, Random rand, + boolean withDeletions) throws IOException { + Directory d = new MockRAMDirectory(); + IndexWriter w = new 
IndexWriter(d, cfg); + final int numValues = 350; + final List numVariantList = new ArrayList(NUMERICS); + + // run in random order to test if fill works correctly during merges + Collections.shuffle(numVariantList, rand); + for (Values val : numVariantList) { + OpenBitSet deleted = indexValues(rand, w, numValues, val, numVariantList, + withDeletions, 7); + List closeables = new ArrayList(); + IndexReader r = w.getReader(); + final int numRemainingValues = (int) (numValues - deleted.cardinality()); + final int base = r.numDocs() - numRemainingValues; + switch (val) { + case PACKED_INTS: + case PACKED_INTS_FIXED: { + Reader intsReader = r.getIndexValues(val.name()); + Source ints = intsReader.load(); + ValuesEnum intsEnum = intsReader.getEnum(); + assertNotNull(intsEnum); + IntsRef enumRef = intsEnum.addAttribute(ValuesAttribute.class).ints(); + closeables.add(intsReader); + for (int i = 0; i < base; i++) { + assertEquals(0, ints.ints(i)); + assertEquals(val.name() + " base: " + base + " index: " + i, i, rand.nextBoolean()?intsEnum.advance(i): intsEnum.nextDoc()); + assertEquals(0, enumRef.get()); + } + int expected = 0; + for (int i = base; i < r.numDocs(); i++, expected++) { + while (deleted.get(expected)) { + expected++; + } + assertEquals("advance failed at index: " + i + " of " + r.numDocs() + " docs", i, intsEnum.advance(i)); + assertEquals(expected, ints.ints(i)); + assertEquals(expected, enumRef.get()); + + } + } + break; + case SIMPLE_FLOAT_4BYTE: + case SIMPLE_FLOAT_8BYTE: { + Reader floatReader = r.getIndexValues(val.name()); + Source floats = floatReader.load(); + ValuesEnum floatEnum = floatReader.getEnum(); + assertNotNull(floatEnum); + FloatsRef enumRef = floatEnum.addAttribute(ValuesAttribute.class).floats(); + closeables.add(floatReader); + + for (int i = 0; i < base; i++) { + assertEquals(0.0d, floats.floats(i)); + assertEquals(i, rand.nextBoolean()?floatEnum.advance(i): floatEnum.nextDoc()); + assertEquals("index " + i, 0.0 ,enumRef.get()); + } + int expected = 0; + for (int i = base; i < r.numDocs(); i++, expected++) { + while (deleted.get(expected)) { + expected++; + } + assertEquals("advance failed at index: " + i + " of " + r.numDocs() + " docs base:" + base, i, floatEnum.advance(i)); + assertEquals("index " + i, 2.0 * expected ,enumRef.get() , 0.00001); + assertEquals("index " + i, 2.0 * expected, floats.floats(i), 0.00001); + } + } + break; + default: + fail("unexpected value " + val); + } + + closeables.add(r); + for (Closeable toClose : closeables) { + toClose.close(); + } + } + w.close(); + d.close(); + } + + private static EnumSet BYTES = EnumSet.of( + Values.BYTES_FIXED_DEREF, + Values.BYTES_FIXED_SORTED, + Values.BYTES_FIXED_STRAIGHT, + Values.BYTES_VAR_DEREF , + Values.BYTES_VAR_SORTED, + Values.BYTES_VAR_STRAIGHT + ); + + private static EnumSet STRAIGHT_BYTES = EnumSet.of( + Values.BYTES_FIXED_STRAIGHT, + Values.BYTES_VAR_STRAIGHT + ); + + private static EnumSet NUMERICS = EnumSet.of(Values.PACKED_INTS, Values.PACKED_INTS_FIXED, Values.SIMPLE_FLOAT_4BYTE, Values.SIMPLE_FLOAT_8BYTE); + + private OpenBitSet indexValues(Random rand, IndexWriter w, int numValues, + Values value, List valueVarList, boolean withDeletions, + int multOfSeven) throws CorruptIndexException, IOException { + final boolean isNumeric = NUMERICS.contains(value); + OpenBitSet deleted = new OpenBitSet(numValues); + Document doc = new Document(); + Fieldable fieldable = new AttributeField(value.name()); + ValuesAttribute valuesAttribute = 
fieldable.attributes().addAttribute(ValuesAttribute.class); + valuesAttribute.setType(value); + doc.add(fieldable); + + final IntsRef intsRef = valuesAttribute.ints(); + final FloatsRef floatsRef = valuesAttribute.floats(); + final BytesRef bytesRef = valuesAttribute.bytes(); + + final String idBase = value.name() + "_"; + final byte[] b = new byte[multOfSeven]; + if (bytesRef != null) { + bytesRef.bytes = b; + bytesRef.length = b.length; + bytesRef.offset = 0; + } + + byte upto = 0; + for (int i = 0; i < numValues; i++) { + if (isNumeric) { + switch (value) { + case PACKED_INTS: + case PACKED_INTS_FIXED: + intsRef.set(i); + break; + case SIMPLE_FLOAT_4BYTE: + case SIMPLE_FLOAT_8BYTE: + floatsRef.set(2.0f * i); + break; + default: + fail("unexpected value " + value); + } + } else { + for (int j = 0; j < b.length; j++) { + b[j] = upto++; + } + } + doc.removeFields("id"); + doc.add(new Field("id", idBase + i, Store.YES, + Index.NOT_ANALYZED_NO_NORMS)); + w.addDocument(doc); + + if (i % 7 == 0) { + if (withDeletions && rand.nextBoolean()) { + Values val = valueVarList.get(rand.nextInt(1 + valueVarList + .indexOf(value))); + final int randInt = val == value ? rand.nextInt(1 + i) : rand + .nextInt(numValues); + w.deleteDocuments(new Term("id", val.name() + "_" + randInt)); + if (val == value) { + deleted.set(randInt); + } + } + w.commit(); + + } + } + w.commit(); + + // nocommit test unoptimized with deletions + if(withDeletions || rand.nextBoolean()) + w.optimize(); + return deleted; + } + + public void runTestIndexBytes(IndexWriterConfig cfg, Random rand, + boolean withDeletions) throws CorruptIndexException, + LockObtainFailedException, IOException { + Directory d = new MockRAMDirectory(); + IndexWriter w = new IndexWriter(d, cfg); + final List byteVariantList = new ArrayList(BYTES); + + // run in random order to test if fill works correctly during merges + Collections.shuffle(byteVariantList, rand); + final int numValues = 350; + for (Values byteIndexValue : byteVariantList) { + int bytesSize = 7 + rand.nextInt(128); + OpenBitSet deleted = indexValues(rand, w, numValues, byteIndexValue, + byteVariantList, withDeletions, bytesSize); + final IndexReader r = w.getReader(); + assertEquals(0, r.numDeletedDocs()); + List closeables = new ArrayList(); + final int numRemainingValues = (int) (numValues - deleted.cardinality()); + final int base = r.numDocs() - numRemainingValues; + + Reader bytesReader = r.getIndexValues(byteIndexValue.name()); + closeables.add(bytesReader); + assertNotNull("field " + byteIndexValue.name() + + " returned null reader - maybe merged failed", bytesReader); + Source bytes = bytesReader.load(); + ValuesEnum bytesEnum = bytesReader.getEnum(); + assertNotNull(bytesEnum); + final ValuesAttribute attr = bytesEnum.addAttribute(ValuesAttribute.class); + byte upto = 0; + // test the filled up slots for correctness + for (int i = 0; i < base; i++) { + BytesRef br = bytes.bytes(i); + String msg = " field: " + byteIndexValue.name() + " at index: " + i + + " base: " + base + " numDocs:" + r.numDocs(); + switch (byteIndexValue) { + case BYTES_VAR_STRAIGHT: + case BYTES_FIXED_STRAIGHT: + assertEquals(i, bytesEnum.advance(i)); + // fixed straight returns bytesref with zero bytes all of fixed + // length + assertNotNull("expected none null - " + msg, br); + if(br.length != 0) { + assertEquals("expected zero bytes of length " + bytesSize + " - " + + msg, bytesSize, br.length); + for (int j = 0; j < br.length; j++) { + assertEquals("Byte at index " + j + " doesn't match - " + msg, 0, + 
br.bytes[br.offset + j]); + } + } + break; + case BYTES_VAR_SORTED: + case BYTES_FIXED_SORTED: + case BYTES_VAR_DEREF: + case BYTES_FIXED_DEREF: + default: + assertNotNull("expected none null - " + msg, br); + assertEquals("expected empty bytes -" + msg, 0, br.length); + } + } + final BytesRef enumRef = attr.bytes(); + + + // test the actual doc values added in this iteration + assertEquals(base + numRemainingValues, r.numDocs()); + int v = 0; + for (int i = base; i < r.numDocs(); i++) { + + String msg = " field: " + byteIndexValue.name() + " at index: " + i + + " base: " + base + " numDocs:" + r.numDocs() + " bytesSize: " + bytesSize; + while (withDeletions && deleted.get(v++)) { + upto += bytesSize; + } + + BytesRef br = bytes.bytes(i); + if(bytesEnum.docID() != i) + assertEquals("seek failed for index " + i + " " + msg, i, bytesEnum.advance(i)); + for (int j = 0; j < br.length; j++, upto++) { + assertEquals("EnumRef Byte at index " + j + " doesn't match - " + msg, + upto, enumRef.bytes[enumRef.offset + j]); + assertEquals("SourceRef Byte at index " + j + " doesn't match - " + msg, + upto, br.bytes[br.offset + j]); + } + } + + // clean up + closeables.add(r); + for (Closeable toClose : closeables) { + toClose.close(); + } + } + w.close(); + d.close(); + } + +} Property changes on: src/test/org/apache/lucene/index/values/TestIndexValues.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL Index: src/test/org/apache/lucene/store/MockRAMDirectory.java =================================================================== --- src/test/org/apache/lucene/store/MockRAMDirectory.java (revision 983076) +++ src/test/org/apache/lucene/store/MockRAMDirectory.java (working copy) @@ -19,9 +19,13 @@ import java.io.IOException; import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.io.PrintWriter; +import java.io.StringWriter; import java.util.Collection; import java.util.Collections; import java.util.Iterator; +import java.util.List; import java.util.Random; import java.util.Map; import java.util.HashMap; @@ -53,6 +57,8 @@ // member initialization vs when it calls super. It seems // like super is called, then our members are initialized: Map openFiles; + Set openInputStreams = new HashSet(); + private synchronized void init() { if (openFiles == null) @@ -229,7 +235,8 @@ fileMap.put(name, file); } - return new MockRAMOutputStream(this, file, name); + MockRAMOutputStream stream = new MockRAMOutputStream(this, file, name); + return stream; } @Override @@ -246,7 +253,9 @@ openFiles.put(name, Integer.valueOf(1)); } } - return new MockRAMInputStream(this, name, file); + MockRAMInputStream stream = new MockRAMInputStream(this, name, file); + openInputStreams.add(stream); + return stream; } /** Provided for testing purposes. Use sizeInBytes() instead. 
*/ @@ -279,7 +288,14 @@ if (noDeleteOpenFile && openFiles.size() > 0) { // RuntimeException instead of IOException because // super() does not throw IOException currently: - throw new RuntimeException("MockRAMDirectory: cannot close: there are still open files: " + openFiles); + Set streams = this.openInputStreams; + StringWriter sw = new StringWriter(); + PrintWriter pw = new PrintWriter(sw); + for (MockRAMInputStream stream : streams) { + stream.ex.printStackTrace(pw); + pw.println(); + } + throw new RuntimeException("MockRAMDirectory: cannot close: there are still open files: " + openFiles +" opened at: " + sw.toString() ); } } Index: src/test/org/apache/lucene/store/MockRAMInputStream.java =================================================================== --- src/test/org/apache/lucene/store/MockRAMInputStream.java (revision 983076) +++ src/test/org/apache/lucene/store/MockRAMInputStream.java (working copy) @@ -28,6 +28,7 @@ private MockRAMDirectory dir; private String name; private boolean isClone; + Exception ex; /** Construct an empty output buffer. * @throws IOException */ @@ -35,6 +36,8 @@ super(f); this.name = name; this.dir = dir; + // store the stacktrace + ex = new Exception(); } @Override @@ -45,6 +48,7 @@ // all clones get closed: if (!isClone) { synchronized(dir) { + dir.openInputStreams.remove(this); Integer v = dir.openFiles.get(name); // Could be null when MockRAMDirectory.crash() was called if (v != null) { Index: src/test/org/apache/lucene/store/MockRAMOutputStream.java =================================================================== --- src/test/org/apache/lucene/store/MockRAMOutputStream.java (revision 983076) +++ src/test/org/apache/lucene/store/MockRAMOutputStream.java (working copy) @@ -30,6 +30,8 @@ private MockRAMDirectory dir; private boolean first=true; private final String name; + //TODO(simonw): finish this + final Exception ex; byte[] singleByte = new byte[1]; @@ -38,6 +40,8 @@ super(f); this.dir = dir; this.name = name; + // safe the stacktrace + ex = new Exception(); } @Override Index: src/test/org/apache/lucene/util/_TestUtil.java =================================================================== --- src/test/org/apache/lucene/util/_TestUtil.java (revision 983076) +++ src/test/org/apache/lucene/util/_TestUtil.java (working copy) @@ -116,6 +116,37 @@ } return new String(buffer, 0, end); } + + public static String randomUnicodeString(Random r, int minLength, int maxLength) { + if(minLength > maxLength) + throw new IllegalArgumentException("minLength must be >= maxLength"); + final boolean lenEqual = minLength==maxLength; + final int end = lenEqual?minLength:minLength + r.nextInt(maxLength-minLength+1); + if (end == 0) { + // allow 0 length + return ""; + } + + // TODO(simonw): check this + final int fixedPlane = 5;//minLength % 5; + final char[] buffer = new char[end]; + for (int i = 0; i < end; i++) { + int t = lenEqual? 
fixedPlane: r.nextInt(5); + //buffer[i] = (char) (97 + r.nextInt(26)); + if (0 == t && i < end - 1 && !lenEqual) { + // Make a surrogate pair + // High surrogate + buffer[i++] = (char) nextInt(r, 0xd800, 0xdbff); + // Low surrogate + buffer[i] = (char) nextInt(r, 0xdc00, 0xdfff); + } + else if (t <= 1) buffer[i] = (char) r.nextInt(0x80); + else if (2 == t) buffer[i] = (char) nextInt(r, 0x80, 0x800); + else if (3 == t) buffer[i] = (char) nextInt(r, 0x800, 0xd7ff); + else if (4 == t) buffer[i] = (char) nextInt(r, 0xe000, 0xffff); + } + return new String(buffer, 0, end); + } private static final int[] blockStarts = { 0x0000, 0x0080, 0x0100, 0x0180, 0x0250, 0x02B0, 0x0300, 0x0370, 0x0400, Index: src/test/org/apache/lucene/util/packed/PackedReaderIteratorTest.java =================================================================== --- src/test/org/apache/lucene/util/packed/PackedReaderIteratorTest.java (revision 0) +++ src/test/org/apache/lucene/util/packed/PackedReaderIteratorTest.java (revision 0) @@ -0,0 +1,77 @@ +package org.apache.lucene.util.packed; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Random; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.MockRAMDirectory; +import org.apache.lucene.util.LuceneTestCaseJ4; +import org.junit.Test; +import static org.junit.Assert.*; +/** + * + * + */ +public class PackedReaderIteratorTest extends LuceneTestCaseJ4 { + + @Test + public void testNext() throws IOException { + Random rnd = newRandom(); + for(int iter=0;iter<5*RANDOM_MULTIPLIER;iter++) { + long ceil = 2; + for(int nbits=1;nbits<63;nbits++) { + final int valueCount = 100+rnd.nextInt(500); + final Directory d = new MockRAMDirectory(); + + IndexOutput out = d.createOutput("out.bin"); + PackedInts.Writer w = PackedInts.getWriter( + out, valueCount, nbits); + + final long[] values = new long[valueCount]; + for(int i=0;i